From 3fdb587d8bb5142e0da79f80d921d57eef842331 Mon Sep 17 00:00:00 2001 From: kvladimi Date: Wed, 20 May 2020 04:31:08 -0700 Subject: [PATCH] Open-source whole Vector Compute backend Change-Id: Id22b13722d4c79f70e4b0d1629510526d8dcf2e2 --- IGC/AdaptorOCL/cmc.cpp | 144 + IGC/AdaptorOCL/cmc.h | 8 + IGC/AdaptorOCL/dllInterfaceCompute.cpp | 150 + IGC/CMakeLists.txt | 24 + IGC/VectorCompiler/.gitignore | 60 + IGC/VectorCompiler/CMakeLists.txt | 158 + IGC/VectorCompiler/cmake/spirv.cmake | 180 + IGC/VectorCompiler/include/CMakeLists.txt | 18 + IGC/VectorCompiler/include/vc/CMakeLists.txt | 1 + .../include/vc/GenXCodeGen/GenXTarget.h | 42 + .../include/vc/GenXCodeGen/GenXWrapper.h | 142 + .../include/vc/GenXOpts/GenXAnalysis.h | 79 + .../include/vc/GenXOpts/GenXOpts.h | 74 + .../include/vc/GenXOpts/Utils/CMRegion.h | 237 + .../include/vc/GenXOpts/Utils/GenXSTLExtras.h | 80 + .../include/vc/GenXOpts/Utils/KernelInfo.h | 356 + .../include/vc/GenXOpts/Utils/RegCategory.h | 55 + .../include/vc/Support/CMakeLists.txt | 3 + .../include/vc/Support/Options.h | 58 + .../include/vc/Support/Options.td | 117 + .../include/vc/Support/Status.h | 158 + .../include/vc/Support/StatusCode.h | 75 + .../include/vc/Support/StatusTraits.h | 85 + .../lib/BackendPlugin/BackendPlugin.cpp | 36 + .../lib/BackendPlugin/CMakeLists.txt | 39 + IGC/VectorCompiler/lib/CMakeLists.txt | 9 + .../lib/GenXCodeGen/CMakeLists.txt | 84 + .../lib/GenXCodeGen/FunctionGroup.cpp | 671 ++ .../lib/GenXCodeGen/FunctionGroup.h | 280 + IGC/VectorCompiler/lib/GenXCodeGen/GenX.h | 157 + IGC/VectorCompiler/lib/GenXCodeGen/GenX.td | 87 + .../lib/GenXCodeGen/GenXAddressCommoning.cpp | 1047 +++ .../GenXAggregatePseudoLowering.cpp | 366 ++ .../lib/GenXCodeGen/GenXAlignmentInfo.cpp | 401 ++ .../lib/GenXCodeGen/GenXAlignmentInfo.h | 154 + .../lib/GenXCodeGen/GenXAnalysisDumper.cpp | 144 + .../lib/GenXCodeGen/GenXArgIndirection.cpp | 1822 ++++++ .../lib/GenXCodeGen/GenXBaling.cpp | 2365 +++++++ .../lib/GenXCodeGen/GenXBaling.h | 550 ++ .../lib/GenXCodeGen/GenXCFSimplification.cpp | 354 + .../lib/GenXCodeGen/GenXCategory.cpp | 1060 +++ .../lib/GenXCodeGen/GenXCisaBuilder.cpp | 5779 +++++++++++++++++ .../lib/GenXCodeGen/GenXCoalescing.cpp | 1759 +++++ .../lib/GenXCodeGen/GenXConstants.cpp | 1524 +++++ .../lib/GenXCodeGen/GenXConstants.h | 135 + .../lib/GenXCodeGen/GenXDeadVectorRemoval.cpp | 746 +++ .../lib/GenXCodeGen/GenXDepressurizer.cpp | 1662 +++++ .../lib/GenXCodeGen/GenXEmulate.cpp | 174 + .../lib/GenXCodeGen/GenXExtractVectorizer.cpp | 295 + .../lib/GenXCodeGen/GenXFuncPtrsLowering.cpp | 364 ++ .../lib/GenXCodeGen/GenXGEPLowering.cpp | 324 + .../lib/GenXCodeGen/GenXGotoJoin.cpp | 332 + .../lib/GenXCodeGen/GenXGotoJoin.h | 83 + .../GenXCodeGen/GenXIMadPostLegalization.cpp | 390 ++ .../lib/GenXCodeGen/GenXInlineAsmLowering.cpp | 345 + .../GenXCodeGen/GenXInstCombineCleanup.cpp | 141 + .../lib/GenXCodeGen/GenXIntrinsics.cpp | 201 + .../lib/GenXCodeGen/GenXIntrinsics.h | 324 + .../lib/GenXCodeGen/GenXLayoutBlocks.cpp | 126 + .../lib/GenXCodeGen/GenXLegalization.cpp | 2613 ++++++++ .../lib/GenXCodeGen/GenXLiveRanges.cpp | 215 + .../lib/GenXCodeGen/GenXLiveness.cpp | 1872 ++++++ .../lib/GenXCodeGen/GenXLiveness.h | 666 ++ .../lib/GenXCodeGen/GenXLowerAggrCopies.cpp | 200 + .../lib/GenXCodeGen/GenXLowerAggrCopies.h | 41 + .../lib/GenXCodeGen/GenXLowering.cpp | 3071 +++++++++ .../lib/GenXCodeGen/GenXModule.cpp | 140 + .../lib/GenXCodeGen/GenXModule.h | 185 + .../lib/GenXCodeGen/GenXNumbering.cpp | 392 ++ .../lib/GenXCodeGen/GenXNumbering.h | 166 + 
.../lib/GenXCodeGen/GenXOCLInfoExtractor.cpp | 77 + .../lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp | 292 + .../lib/GenXCodeGen/GenXOCLRuntimeInfo.h | 256 + .../lib/GenXCodeGen/GenXPatternMatch.cpp | 2640 ++++++++ .../lib/GenXCodeGen/GenXPostLegalization.cpp | 171 + .../lib/GenXCodeGen/GenXPressureTracker.cpp | 211 + .../lib/GenXCodeGen/GenXPressureTracker.h | 91 + .../lib/GenXCodeGen/GenXPrinter.cpp | 243 + .../lib/GenXCodeGen/GenXPromoteArray.cpp | 1081 +++ .../lib/GenXCodeGen/GenXPromotePredicate.cpp | 204 + .../lib/GenXCodeGen/GenXRawSendRipper.cpp | 96 + .../lib/GenXCodeGen/GenXReduceIntSize.cpp | 1038 +++ .../lib/GenXCodeGen/GenXRegion.cpp | 954 +++ .../lib/GenXCodeGen/GenXRegion.h | 197 + .../lib/GenXCodeGen/GenXRegionCollapsing.cpp | 1460 +++++ .../lib/GenXCodeGen/GenXRematerialization.cpp | 146 + .../lib/GenXCodeGen/GenXSimdCFConformance.cpp | 3698 +++++++++++ .../lib/GenXCodeGen/GenXSubtarget.cpp | 145 + .../lib/GenXCodeGen/GenXSubtarget.h | 293 + .../lib/GenXCodeGen/GenXTargetMachine.cpp | 546 ++ .../lib/GenXCodeGen/GenXTargetMachine.h | 183 + .../GenXCodeGen/GenXThreadPrivateMemory.cpp | 1023 +++ .../lib/GenXCodeGen/GenXTidyControlFlow.cpp | 302 + .../lib/GenXCodeGen/GenXUnbaling.cpp | 1204 ++++ .../lib/GenXCodeGen/GenXUtil.cpp | 1446 +++++ IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h | 429 ++ .../lib/GenXCodeGen/GenXVectorDecomposer.cpp | 1177 ++++ .../lib/GenXCodeGen/GenXVectorDecomposer.h | 175 + IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h | 140 + .../lib/GenXCodeGen/GenXVisaRegAlloc.cpp | 698 ++ .../lib/GenXCodeGen/GenXVisaRegAlloc.h | 253 + .../lib/GenXCodeGen/GenXWATable.cpp | 34 + .../lib/GenXCodeGen/GenXWATable.h | 57 + .../lib/GenXCodeGen/GenXWrapper.cpp | 717 ++ .../lib/GenXCodeGen/IgnoreRAUWValueMap.h | 42 + .../lib/GenXCodeGen/IsaDescription.h | 254 + .../lib/GenXCodeGen/KillAnalysis.cpp | 188 + .../lib/GenXCodeGen/KillAnalysis.h | 51 + .../lib/GenXCodeGen/TargetInfo/CMakeLists.txt | 5 + .../GenXCodeGen/TargetInfo/GenXTargetInfo.cpp | 50 + .../GenXCodeGen/TargetInfo/GenXTargetInfo.h | 39 + .../lib/GenXCodeGen/Utils/CMakeLists.txt | 23 + .../Utils/cisa_gen_intrinsics.json | 3674 +++++++++++ .../GenXCodeGen/Utils/cisa_gen_intrinsics.py | 230 + .../CMAnalysis/ConstantFoldingGenX.cpp | 285 + .../CMAnalysis/InstructionSimplifyGenX.cpp | 269 + .../GenXOpts/CMPacketize/GenXPacketize.cpp | 1757 +++++ .../GenXOpts/CMPacketize/PacketBuilder.cpp | 209 + .../lib/GenXOpts/CMPacketize/PacketBuilder.h | 340 + .../CMPacketize/PacketBuilder_math.cpp | 163 + .../CMPacketize/PacketBuilder_mem.cpp | 172 + .../CMPacketize/PacketBuilder_misc.cpp | 503 ++ .../lib/GenXOpts/CMPacketize/README.md | 1 + .../lib/GenXOpts/CMPacketize/WIAnalysis.cpp | 900 +++ .../lib/GenXOpts/CMPacketize/WIAnalysis.hpp | 265 + .../lib/GenXOpts/CMPacketize/gen_builder.hpp | 1035 +++ .../CMPacketize/gen_builder_intrin.hpp | 172 + .../GenXOpts/CMPacketize/gen_builder_meta.hpp | 244 + .../lib/GenXOpts/CMTrans/CMABI.cpp | 1942 ++++++ .../lib/GenXOpts/CMTrans/CMImpParam.cpp | 701 ++ .../GenXOpts/CMTrans/CMKernelArgOffset.cpp | 621 ++ .../lib/GenXOpts/CMTrans/CMRegion.cpp | 925 +++ .../lib/GenXOpts/CMakeLists.txt | 20 + IGC/VectorCompiler/lib/Support/CMakeLists.txt | 11 + IGC/VectorCompiler/lib/Support/Options.cpp | 62 + IGC/VectorCompiler/lib/Support/Status.cpp | 150 + ...ddress-spaces-for-VectorCompute-glob.patch | 40 + ...ncParamKindINTEL-and-DecorationFuncP.patch | 198 + .../0003-Add-SPIRVDLL-and-VCExport.patch | 216 + ...Lib-from-targets-Rename-tool-llvm-sp.patch | 107 + IGC/VectorCompiler/tests/vctest_config.yml | 
2 + IGC/VectorCompiler/unittests/CMakeLists.txt | 9 + .../unittests/Regions/CMakeLists.txt | 17 + .../unittests/Regions/OverlapTest.cpp | 81 + .../unittests/SPIRVConversions/CMakeLists.txt | 16 + .../SPIRVConversions/SPIRVConversionsTest.cpp | 255 + IGC/common/igc_flags.def | 2 + 147 files changed, 74118 insertions(+) create mode 100644 IGC/VectorCompiler/.gitignore create mode 100644 IGC/VectorCompiler/CMakeLists.txt create mode 100644 IGC/VectorCompiler/cmake/spirv.cmake create mode 100644 IGC/VectorCompiler/include/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h create mode 100644 IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h create mode 100644 IGC/VectorCompiler/include/vc/Support/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/Support/Options.h create mode 100644 IGC/VectorCompiler/include/vc/Support/Options.td create mode 100644 IGC/VectorCompiler/include/vc/Support/Status.h create mode 100644 IGC/VectorCompiler/include/vc/Support/StatusCode.h create mode 100644 IGC/VectorCompiler/include/vc/Support/StatusTraits.h create mode 100644 IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp create mode 100644 IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenX.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenX.td create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp create mode 100644 
IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp 
create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt create mode 100755 IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json create mode 100755 IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/Support/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/Support/Options.cpp create mode 100644 IGC/VectorCompiler/lib/Support/Status.cpp create mode 100644 IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch create mode 100644 IGC/VectorCompiler/tests/vctest_config.yml create mode 100644 IGC/VectorCompiler/unittests/CMakeLists.txt 
create mode 100644 IGC/VectorCompiler/unittests/Regions/CMakeLists.txt create mode 100644 IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp create mode 100644 IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt create mode 100644 IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp diff --git a/IGC/AdaptorOCL/cmc.cpp b/IGC/AdaptorOCL/cmc.cpp index 225357fdacec..1df4d3f7e34b 100644 --- a/IGC/AdaptorOCL/cmc.cpp +++ b/IGC/AdaptorOCL/cmc.cpp @@ -611,3 +611,147 @@ int cmc::vISACompile_v2(cmc_compile_info_v2* output, iOpenCL::CGen8CMProgram& CM CMProgram.CreateKernelBinaries(); return status; } + +static void getCmcArg(cmc_arg_info& CmcArg, const vc::ocl::ArgInfo& Arg) +{ + switch (Arg.Kind) + { + case vc::ocl::ArgKind::General: + CmcArg.kind = cmc_arg_kind::General; + break; + case vc::ocl::ArgKind::LocalSize: + CmcArg.kind = cmc_arg_kind::LocalSize; + break; + case vc::ocl::ArgKind::GroupCount: + CmcArg.kind = cmc_arg_kind::GroupCount; + break; + case vc::ocl::ArgKind::Buffer: + CmcArg.kind = cmc_arg_kind::Buffer; + break; + case vc::ocl::ArgKind::SVM: + CmcArg.kind = cmc_arg_kind::SVM; + break; + case vc::ocl::ArgKind::Sampler: + CmcArg.kind = cmc_arg_kind::Sampler; + break; + case vc::ocl::ArgKind::Image1d: + CmcArg.kind = cmc_arg_kind::Image1d; + break; + case vc::ocl::ArgKind::Image2d: + CmcArg.kind = cmc_arg_kind::Image2d; + break; + case vc::ocl::ArgKind::Image3d: + CmcArg.kind = cmc_arg_kind::Image3d; + break; + case vc::ocl::ArgKind::PrintBuffer: + CmcArg.kind = cmc_arg_kind::PrintBuffer; + break; + case vc::ocl::ArgKind::PrivateBase: + CmcArg.kind = cmc_arg_kind::PrivateBase; + break; + } + + switch (Arg.AccessKind) + { + case vc::ocl::ArgAccessKind::None: + CmcArg.access = cmc_access_kind::undef; + break; + case vc::ocl::ArgAccessKind::ReadOnly: + CmcArg.access = cmc_access_kind::read_only; + break; + case vc::ocl::ArgAccessKind::WriteOnly: + CmcArg.access = cmc_access_kind::write_only; + break; + case vc::ocl::ArgAccessKind::ReadWrite: + CmcArg.access = cmc_access_kind::read_write; + break; + } + + CmcArg.index = Arg.Index; + CmcArg.offset = Arg.Offset; + CmcArg.sizeInBytes = Arg.SizeInBytes; + CmcArg.BTI = Arg.BTI; +} + +// Returns vector of cmc_arg_info with all fields initialized. +static std::vector getCmcArgInfos(const std::vector& Args) +{ + std::vector CmcArgs{Args.size()}; + for (unsigned i = 0, e = Args.size(); i != e; ++i) + getCmcArg(CmcArgs[i], Args[i]); + return CmcArgs; +} + +static std::vector getCmcPrintStrings( + const std::vector& Original) +{ + std::vector Converted; + std::transform(Original.begin(), Original.end(), std::back_inserter(Converted), + [](const std::string &str) { + IGC_ASSERT_MESSAGE(str.size() < cmc_ocl_print_string::max_width, "illegal string length"); + cmc_ocl_print_string Tmp; + strcpy_s(Tmp.s, cmc_ocl_print_string::max_width, str.c_str()); + return Tmp; + }); + return Converted; +} + +struct CmcContext +{ + std::vector Args; + std::vector PrintStrings; +}; + +// Fills non-owning cmc_kernel_info with all fields initialized. 
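The function below only wires up pointers: cmc_kernel_info_v2 keeps name, arg_descs and print_string_descs pointing into data owned by the vc::ocl::KernelInfo and the CmcContext, so the CmcContext must outlive every use of the filled-in cmc_kernel_info_v2. A minimal sketch of that non-owning pattern, using simplified stand-in types rather than the real IGC structures (ArgDesc, KernelDesc, Context and fillKernelDesc are illustrative names, not part of the patch):

#include <string>
#include <vector>

// Simplified stand-ins for cmc_arg_info / cmc_kernel_info_v2 (illustrative only).
struct ArgDesc { int kind; unsigned offset; };

struct KernelDesc {            // non-owning view, like cmc_kernel_info_v2
    const char *name;          // points into a std::string owned elsewhere
    unsigned num_args;
    const ArgDesc *arg_descs;  // points into a std::vector owned elsewhere
};

struct Context {               // owns the converted data, like CmcContext
    std::string Name;
    std::vector<ArgDesc> Args;
};

// Fill the non-owning view from the owning context; Ctx must outlive Out.
void fillKernelDesc(KernelDesc &Out, const Context &Ctx) {
    Out.name = Ctx.Name.c_str();
    Out.num_args = static_cast<unsigned>(Ctx.Args.size());
    Out.arg_descs = Ctx.Args.data();
}

In the patch itself, getCmcKernelInfo plays the role of fillKernelDesc, with CmcContext as the owning side.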
+static void getCmcKernelInfo(
+    cmc_kernel_info_v2& CmcInfo,
+    const vc::ocl::KernelInfo& Info,
+    const FINALIZER_INFO& JitInfo,
+    CmcContext& CmcCtx)
+{
+    IGC_ASSERT_MESSAGE(CmcCtx.PrintStrings.size() == Info.PrintStrings.size(), "inconsistent arguments");
+    CmcInfo.name = Info.Name.c_str();
+    CmcInfo.num_args = CmcCtx.Args.size();
+    CmcInfo.arg_descs = CmcCtx.Args.data();
+    CmcInfo.HasLocalIDx = true;
+    CmcInfo.HasLocalIDy = true;
+    CmcInfo.HasLocalIDz = true;
+    CmcInfo.HasGroupID = Info.HasGroupID;
+    CmcInfo.CompiledSIMDSize = 1;
+    CmcInfo.SLMSize = Info.SLMSize;
+    CmcInfo.NumGRFRequired = JitInfo.numGRFTotal;
+    CmcInfo.GRFByteSize = Info.GRFSizeInBytes;
+    CmcInfo.HasBarriers = Info.HasBarriers;
+    CmcInfo.StatelessPrivateMemSize = Info.StatelessPrivateMemSize;
+    CmcInfo.HasReadWriteImages = Info.HasReadWriteImages;
+    CmcInfo.num_print_strings = CmcCtx.PrintStrings.size();
+    CmcInfo.print_string_descs = CmcCtx.PrintStrings.data();
+    // std::copy requires either reinterprets or implementation of operator= in
+    // TableInfos from independent headers so memcpy seems to be the best option
+    // for now
+    memcpy_s(&CmcInfo.RelocationTable, sizeof(Info.RelocationTable), &Info.RelocationTable,
+             sizeof(Info.RelocationTable));
+    memcpy_s(&CmcInfo.SymbolTable, sizeof(Info.SymbolTable), &Info.SymbolTable,
+             sizeof(Info.SymbolTable));
+}
+
+void vc::createBinary(
+    iOpenCL::CGen8CMProgram& CMProgram,
+    const std::vector<vc::ocl::CompileInfo>& CompileInfos)
+{
+    cmc_kernel_info_v2 CmcInfo;
+    CmcContext CmcCtx;
+    for (const vc::ocl::CompileInfo& Info : CompileInfos)
+    {
+        CmcCtx.Args = getCmcArgInfos(Info.KernelInfo.Args);
+        CmcCtx.PrintStrings = getCmcPrintStrings(Info.KernelInfo.PrintStrings);
+        getCmcKernelInfo(CmcInfo, Info.KernelInfo, Info.JitInfo, CmcCtx);
+        CMKernel* K = new CMKernel(CMProgram.getPlatform());
+        CMProgram.m_kernels.push_back(K);
+        llvm::ArrayRef GenBin{
+            reinterpret_cast(Info.GenBinary.data()),
+            Info.GenBinary.size()};
+        populateKernelInfo_v2(&CmcInfo, Info.JitInfo, GenBin, *K);
+    }
+    CMProgram.CreateKernelBinaries();
+}
diff --git a/IGC/AdaptorOCL/cmc.h b/IGC/AdaptorOCL/cmc.h
index f369e381dc46..1b23741131f0 100644
--- a/IGC/AdaptorOCL/cmc.h
+++ b/IGC/AdaptorOCL/cmc.h
@@ -35,6 +35,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "igcmc.h"
#include "Compiler/CodeGenPublic.h"
+#include "common/LLVMWarningsPush.hpp"
+#include "VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h"
+#include "common/LLVMWarningsPop.hpp"
namespace iOpenCL {
class CGen8CMProgram;
@@ -111,3 +114,8 @@ extern int vISACompile_v2(cmc_compile_info_v2 *output,
extern const char* getPlatformStr(PLATFORM platform);
} // namespace cmc
+
+namespace vc {
+void createBinary(iOpenCL::CGen8CMProgram &CMProgram,
+                  const std::vector<vc::ocl::CompileInfo> &CompileInfos);
+} // namespace vc
diff --git a/IGC/AdaptorOCL/dllInterfaceCompute.cpp b/IGC/AdaptorOCL/dllInterfaceCompute.cpp
index b42eb24f034d..a874c2aa0031 100644
--- a/IGC/AdaptorOCL/dllInterfaceCompute.cpp
+++ b/IGC/AdaptorOCL/dllInterfaceCompute.cpp
@@ -55,6 +55,11 @@
#include "AdaptorOCL/OCL/sp/gtpin_igc_ocl.h" #include "AdaptorOCL/igcmc.h" #include "AdaptorOCL/cmc.h" +#include "common/LLVMWarningsPush.hpp" +#include +#include "VectorCompiler/include/vc/Support/StatusCode.h" +#include "VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h" +#include "common/LLVMWarningsPop.hpp" #include @@ -820,6 +825,14 @@ static bool TranslateBuildCM(const STB_TranslateInputArgs* pInputArgs, const IGC::CPlatform& IGCPlatform, float profilingTimerResolution); +#if !defined(WDDM_LINUX) +static std::error_code TranslateBuildVC( + const STB_TranslateInputArgs* pInputArgs, + STB_TranslateOutputArgs* pOutputArgs, TB_DATA_FORMAT inputDataFormatTemp, + const IGC::CPlatform& IGCPlatform, float profilingTimerResolution); +#endif // !defined(WDDM_LINUX) + + bool TranslateBuild( const STB_TranslateInputArgs* pInputArgs, STB_TranslateOutputArgs* pOutputArgs, @@ -828,6 +841,16 @@ bool TranslateBuild( float profilingTimerResolution) { if (pInputArgs->pOptions) { +#if !defined(WDDM_LINUX) + std::error_code Status = + TranslateBuildVC(pInputArgs, pOutputArgs, inputDataFormatTemp, + IGCPlatform, profilingTimerResolution); + if (!Status) + return true; + // If vc codegen option was not specified, then vc was not called. + if (static_cast(Status.value()) != vc::errc::not_vc_codegen) + return false; +#endif // !defined(WDDM_LINUX) static const char* CMC = "-cmc"; if (strstr(pInputArgs->pOptions, CMC) != nullptr) return TranslateBuildCM(pInputArgs, @@ -1438,4 +1461,131 @@ static bool TranslateBuildCM(const STB_TranslateInputArgs* pInputArgs, return false; } +#if !defined(WDDM_LINUX) + +static void adjustPlatformVC(const IGC::CPlatform& IGCPlatform, + vc::CompileOptions& Opts) +{ + Opts.CPUStr = cmc::getPlatformStr(IGCPlatform.getPlatformInfo()); + Opts.WATable = std::make_unique(IGCPlatform.getWATable()); +} + +static void adjustFileTypeVC(TB_DATA_FORMAT DataFormat, + vc::CompileOptions& Opts) +{ + switch (DataFormat) + { + case TB_DATA_FORMAT::TB_DATA_FORMAT_SPIR_V: + Opts.FType = vc::FileType::SPIRV; + return; + default: + llvm_unreachable("Data format is not supported yet"); + } +} + +static void adjustOptLevelVC(vc::CompileOptions& Opts) +{ + if (IGC_IS_FLAG_ENABLED(VCOptimizeNone)) + Opts.OptLevel = vc::OptimizerLevel::None; +} + +static void adjustOptionsVC(const IGC::CPlatform& IGCPlatform, + TB_DATA_FORMAT DataFormat, vc::CompileOptions& Opts) +{ + adjustPlatformVC(IGCPlatform, Opts); + adjustFileTypeVC(DataFormat, Opts); + adjustOptLevelVC(Opts); +} + +static std::error_code getErrorVC(llvm::Error Err, + STB_TranslateOutputArgs* pOutputArgs) +{ + std::error_code Status; + llvm::handleAllErrors( + std::move(Err), [&Status, pOutputArgs](const llvm::ErrorInfoBase& EI) { + Status = EI.convertToErrorCode(); + // Some tests check for build log when everything is ok. + // So let's not even try to touch things if we were not called. 
+ if (static_cast(Status.value()) == vc::errc::not_vc_codegen) + return; + SetErrorMessage(EI.message(), *pOutputArgs); + }); + return Status; +} + +static void outputBinaryVC(llvm::StringRef Binary, + STB_TranslateOutputArgs* pOutputArgs) +{ + size_t BinarySize = static_cast(Binary.size()); + char* pBinaryOutput = new char[BinarySize]; + memcpy_s(pBinaryOutput, BinarySize, Binary.data(), BinarySize); + pOutputArgs->OutputSize = static_cast(BinarySize); + pOutputArgs->pOutput = pBinaryOutput; +} + +static std::error_code TranslateBuildVC( + const STB_TranslateInputArgs* pInputArgs, + STB_TranslateOutputArgs* pOutputArgs, TB_DATA_FORMAT inputDataFormatTemp, + const IGC::CPlatform& IGCPlatform, float profilingTimerResolution) +{ +#if IGC_VC_DISABLED + SetErrorMessage("IGC VC explicitly disabled in build", *pOutputArgs); + return false; +#else + + llvm::StringRef ApiOptions{pInputArgs->pOptions, pInputArgs->OptionsSize}; + llvm::StringRef InternalOptions{pInputArgs->pInternalOptions, + pInputArgs->InternalOptionsSize}; + auto pInput = pInputArgs->pInput; + size_t InputSize = pInputArgs->InputSize; + + + auto ExpOptions = vc::ParseOptions(ApiOptions, InternalOptions); + if (!ExpOptions) + return getErrorVC(ExpOptions.takeError(), pOutputArgs); + + // Reset options when everything is done here. + // This is needed to not interfere with subsequent translations. + const auto ClOptGuard = + llvm::make_scope_exit([]() { llvm::cl::ResetAllOptionOccurrences(); }); + + vc::CompileOptions& Opts = ExpOptions.get(); + adjustOptionsVC(IGCPlatform, inputDataFormatTemp, Opts); + + llvm::ArrayRef Input{pInput, InputSize}; + auto ExpOutput = vc::Compile(Input, Opts); + if (!ExpOutput) + return getErrorVC(ExpOutput.takeError(), pOutputArgs); + vc::CompileOutput& Res = ExpOutput.get(); + + auto Visitor = [&IGCPlatform, pOutputArgs](auto&& CompileResult) { + using Ty = std::decay_t; + if constexpr (std::is_same_v) + { + outputBinaryVC(CompileResult.IsaBinary, pOutputArgs); + } + else if constexpr (std::is_same_v) + { + iOpenCL::CGen8CMProgram CMProgram{IGCPlatform.getPlatformInfo()}; + vc::createBinary(CMProgram, CompileResult.Kernels); + Util::BinaryStream ProgramBinary; + CMProgram.GetProgramBinary(ProgramBinary, + CompileResult.PointerSizeInBytes); + llvm::StringRef BinaryRef(ProgramBinary.GetLinearPointer(), + ProgramBinary.Size()); + outputBinaryVC(BinaryRef, pOutputArgs); + } + else + { + static_assert(!sizeof(Ty), "One of compile output is not visited"); + } + }; + + std::visit(Visitor, Res); + + return {}; +#endif +} +#endif // !defined(WDDM_LINUX) + } // namespace TC diff --git a/IGC/CMakeLists.txt b/IGC/CMakeLists.txt index 80c793788fa5..4331acc17cb8 100644 --- a/IGC/CMakeLists.txt +++ b/IGC/CMakeLists.txt @@ -2196,6 +2196,14 @@ set(IGC_BUILD__PROJ_NAME_PREFIX "") set(IGC_BUILD__SPIRV_ENABLED ON) +# Enable vector compiler for Linux and Windows +# If user already defined this, honor decision +if(NOT DEFINED IGC_BUILD__VC_ENABLED) + if(LLVM_ON_UNIX OR LLVM_ON_WIN32) + set(IGC_BUILD__VC_ENABLED ON) + endif() +endif() + # ======================================== Path helper variables ======================================= @@ -3078,6 +3086,12 @@ if(IGC_BUILD__SPIRV_ENABLED) ) endif() +#VC OPT switch on/off +if(NOT IGC_BUILD__VC_ENABLED) + set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS + IGC_VC_DISABLED + ) +endif() set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _SCL_SECURE_NO_WARNINGS _CRT_SECURE_NO_WARNINGS @@ -3446,6 +3460,10 @@ if(LLVM_ON_WIN32 endif() +if(IGC_BUILD__VC_ENABLED 
AND NOT CMAKE_WDDM_LINUX) + add_subdirectory(VectorCompiler) +endif() + add_subdirectory(Compiler) add_subdirectory(DriverInterface) igc_sg_define(IGC__DriverInterface) @@ -3737,6 +3755,12 @@ list(APPEND _targetLinkLineCommon zebinlib) ) endif() + if(IGC_BUILD__VC_ENABLED) + list(APPEND _targetLinkLineCommon + ${IGC_BUILD__PROJ_VC_LIBS_TO_LINK} + ) + endif() + list(APPEND _targetLinkLineCommon "${IGC_BUILD__START_GROUP}" ${IGC_BUILD__LLVM_LIBS_TO_LINK} diff --git a/IGC/VectorCompiler/.gitignore b/IGC/VectorCompiler/.gitignore new file mode 100644 index 000000000000..e4ac6dd803fc --- /dev/null +++ b/IGC/VectorCompiler/.gitignore @@ -0,0 +1,60 @@ +#==============================================================================# +# This file specifies intentionally untracked files that git should ignore. +# See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html +# +# This file is intentionally different from the output of `git svn show-ignore`, +# as most of those are useless. +#==============================================================================# + +#==============================================================================# +# File extensions to be ignored anywhere in the tree. +#==============================================================================# +# Temp files created by most text editors. +*~ +# Merge files created by git. +*.orig +# Byte compiled python modules. +*.pyc +# vim swap files +.*.sw? +.sw? +#OS X specific files. +.DS_store + +# Nested build directory +/build + +#==============================================================================# +# Explicit files to ignore (only matches one). +#==============================================================================# +# Various tag programs +/tags +/TAGS +/GPATH +/GRTAGS +/GSYMS +/GTAGS +.gitusers +autom4te.cache +cscope.files +cscope.out +autoconf/aclocal.m4 +autoconf/autom4te.cache +/compile_commands.json +tags +# Visual Studio built-in CMake configuration +/CMakeSettings.json +# CLion project configuration +/.idea + +#==============================================================================# +# Directories to ignore (do not add trailing '/'s, they skip symlinks). +#==============================================================================# +# Sphinx build tree, if building in-source dir. +docs/_build +docs/autogenerated +# VS2017 and VSCode config files. +.vscode +.vs +# clangd index +.clangd diff --git a/IGC/VectorCompiler/CMakeLists.txt b/IGC/VectorCompiler/CMakeLists.txt new file mode 100644 index 000000000000..44ae6a41fb4a --- /dev/null +++ b/IGC/VectorCompiler/CMakeLists.txt @@ -0,0 +1,158 @@ +#===================== begin_copyright_notice ================================== + +#Copyright (c) 2017 Intel Corporation + +#Permission is hereby granted, free of charge, to any person obtaining a +#copy of this software and associated documentation files (the +#"Software"), to deal in the Software without restriction, including +#without limitation the rights to use, copy, modify, merge, publish, +#distribute, sublicense, and/or sell copies of the Software, and to +#permit persons to whom the Software is furnished to do so, subject to +#the following conditions: + +#The above copyright notice and this permission notice shall be included +#in all copies or substantial portions of the Software. 
+ +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +#OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +#MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +#IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +#CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +#TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +#======================= end_copyright_notice ================================== + + + +# CMake Settings: +# - SPIRV_PREBUILD_DIR +# - SPIRV_SRC +# - INSTALL_SPIRVDLL +# - VC_INTRINSICS_SRC +set(IGC_BUILD__PROJ__VectorCompiler "${IGC_BUILD__PROJ_NAME_PREFIX}VectorCompiler") +set(IGC_BUILD__PROJ__VectorCompiler "${IGC_BUILD__PROJ__VectorCompiler}" PARENT_SCOPE) +set(IGC_BUILD__PROJ_VC_LIBS_TO_LINK VCCodeGen PARENT_SCOPE) + +set(IGC_BUILD__PROJ_LABEL__VectorCompiler "${IGC_BUILD__PROJ__VectorCompiler}") + +message(STATUS "+++ Source/IGC/VectorCompiler +++") +message(STATUS "[VC] Build proj: ${IGC_BUILD__PROJ__VectorCompiler}") + + +igc_arch_get_cpu(_cpuSuffix) + +set(BUILD_EXTERNAL YES) + +# --- LLVM --- +if(IGC_OPTION__FORCE_SYSTEM_LLVM OR (WIN32 AND LLVM_USE_PREBUILT)) + message(STATUS "[VC] Using system llvm") + + # Need to search for llvm-tblgen + find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" + ${LLVM_TOOLS_BINARY_DIR} + NO_DEFAULT_PATH + ) + if(LLVM_TABLEGEN_EXE-NOTFOUND) + message(FATAL_ERROR "[VC] llvm-tblgen is not found") + endif() + message(STATUS "[VC] Found tblgen: ${LLVM_TABLEGEN_EXE}") + + # find_package was called by igc cmake, no need to do it again. +else() + # Prebuilt llvm does not have tblgen... + if(LLVM_USE_PREBUILT) + message(FATAL_ERROR "[VC] vector compiler with prebuilt llvm is not supported") + endif() + + # In last scenario we are building with llvm so every target is defined + # and LLVMConfig will only set needed variables. + message(STATUS "[VC] Using llvm source build") + set(LLVM_BUILD_DIR "${LLVM_SOURCE_DIR}/../build/src") + set(LLVM_CMAKE_DIR "${LLVM_BUILD_DIR}/lib/cmake/llvm") + message(STATUS "[VC] LLVMConfig.cmake location: ${LLVM_CMAKE_DIR}") + find_package(LLVM REQUIRED + HINTS ${LLVM_CMAKE_DIR} + NO_DEFAULT_PATH + ) + + # We have executable target, use it. + set(LLVM_TABLEGEN_EXE "llvm-tblgen") + message(STATUS "[VC] Using executable target llvm-tlbgen for tablegenning") + # IGC has its own special cmake for external llvm. + # It sets LLVM_INCLUDE_DIRS instead of LLVM_INCLUDE_DIR. + set(LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIRS}) +endif() + +# Now find_package was called in all cases and we have all needed variables. +set(CMAKE_MODULE_PATH + ${LLVM_CMAKE_DIR} + ${CMAKE_MODULE_PATH} + ) + +cmake_policy(SET CMP0057 NEW) +# cm offline compiler requires -rdynamic flag to be absent +cmake_policy(SET CMP0065 NEW) + +include(AddLLVM) + +set(LLVM_MAIN_INCLUDE_DIR ${LLVM_INCLUDE_DIR}) +include(TableGen) +# Set LLVM_TABLEGEN_FLAGS manually based on include dirs. +list(TRANSFORM LLVM_INCLUDE_DIR PREPEND "-I=" OUTPUT_VARIABLE LLVM_TABLEGEN_FLAGS) + +message(STATUS "[VC] Including llvm headers: ${LLVM_INCLUDE_DIR}") +include_directories(${LLVM_INCLUDE_DIR}) + +# --- VISA --- + +# HACK. We should use only visa/include without visa internal headers. 
+set(VISA_INCLUDE_DIRS ${IGC_BUILD__VISA_DIR}) + +# --- VC Intrinsics --- + +add_compile_definitions(LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}) + +if(LLVM_ON_WIN32) + add_compile_options(/experimental:external) + foreach(INCDIR ${LLVM_INCLUDE_DIRS}) + add_compile_options("SHELL:/external:I ${INCDIR}") + endforeach() + add_compile_options(/external:W0) + + # disable 32/64 warnings + add_compile_options(/wd4244) + + # disable unary minus to unsigned type warning + add_compile_options(/wd4146) + + # disable implicitly deleted dtor warning + add_compile_options(/wd4624) +endif() + +if(DEFINED VC_INTRINSICS_SRC) + set(INTRSRC "${VC_INTRINSICS_SRC}/GenXIntrinsics") +endif() + +if(NOT DEFINED INTRSRC) + set(INTRSRC "${CMAKE_CURRENT_SOURCE_DIR}/../../../vc-intrinsics/GenXIntrinsics") +endif() + +message(STATUS "[VC] Using vc-intrinsics source from: ${INTRSRC}") + +# We are using prebuilt SPIRV and building intrinsics. +set(INTRBUILD "${CMAKE_CURRENT_BINARY_DIR}/intrbuild") + + +# Do not copy anything from prebuilts. libSPIRVDLL.so will be dynamically loaded at runtime. +add_subdirectory(${INTRSRC} ${INTRBUILD}) +include_directories(${INTRSRC}/include ${INTRBUILD}/include) + +include(cmake/spirv.cmake) + +# --- VC Opt --- + +add_subdirectory(include) +add_subdirectory(lib) + diff --git a/IGC/VectorCompiler/cmake/spirv.cmake b/IGC/VectorCompiler/cmake/spirv.cmake new file mode 100644 index 000000000000..8ebe5c8a1c11 --- /dev/null +++ b/IGC/VectorCompiler/cmake/spirv.cmake @@ -0,0 +1,180 @@ +# +# Creates `target_branch` starting at the `base_revision` in the `repo_dir`. +# Then all patches from the `patches_dir` are committed to the `target_branch`. +# Does nothing if the `target_branch` is already checked out in the `repo_dir`. +# +function(apply_patches repo_dir patches_dir base_revision target_branch) + file(GLOB patches ${patches_dir}/*.patch) + if(NOT patches) + message(STATUS "No patches in ${patches_dir}") + return() + endif() + + if(NOT DEFINED GIT_EXECUTABLE) + find_program(GIT_EXECUTABLE git) + endif() + + message(STATUS "[VC] ${repo_dir}:") + # Check if the target branch already exists + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --verify --no-revs -q ${target_branch} + WORKING_DIRECTORY ${repo_dir} + RESULT_VARIABLE patches_needed + ) + # Set up fake username just in case if we don't have one globally + execute_process( + COMMAND ${GIT_EXECUTABLE} config --local user.name "patcher" + WORKING_DIRECTORY ${repo_dir} + ) + execute_process( + COMMAND ${GIT_EXECUTABLE} config --local user.email "patcher@intel.com" + WORKING_DIRECTORY ${repo_dir} + ) + if(patches_needed) # The target branch doesn't exist + list(SORT patches) + execute_process( # Create the target branch + COMMAND ${GIT_EXECUTABLE} checkout -b ${target_branch} ${base_revision} + WORKING_DIRECTORY ${repo_dir} + ) + execute_process( # Apply the pathces + COMMAND ${GIT_EXECUTABLE} am --3way --ignore-whitespace ${patches} + WORKING_DIRECTORY ${repo_dir} + ) + else() # The target branch already exists + execute_process( # Check it out + COMMAND ${GIT_EXECUTABLE} checkout ${target_branch} + WORKING_DIRECTORY ${repo_dir} + ) + endif() +endfunction() + +# User may switch spirv dll installation off +if(NOT DEFINED INSTALL_SPIRVDLL) + set(INSTALL_SPIRVDLL 1) +endif() + +# Handle installation of SPIRVDLL. +# Currently, release build of spirvdll is used to read spirv. +# For debugging, one has to build debug version locally and replace release library. 
+if(INSTALL_SPIRVDLL) +if(NOT DEFINED SPIRV_PREBUILD_DIR AND NOT WIN32) +include(ExternalProject) +set(SPIRV_COPY "${CMAKE_CURRENT_BINARY_DIR}/llvm-spirv-vc") +if(DEFINED SPIRV_SRC) + if(NOT EXISTS ${SPIRV_SRC}) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SRC}") + endif() + set(SPIRV_SOURCES ${SPIRV_SRC}) +else() + set(SPIRV_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../llvm-project/llvm/projects/llvm-spirv") + if(NOT EXISTS ${SPIRV_SOURCES}) + message(STATUS "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SOURCES}") + set(SPIRV_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../llvm-spirv") + endif() + if(NOT EXISTS ${SPIRV_SOURCES}) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SOURCES}") + endif() +endif() + +set(SPIRV_REV_PATCH e87b59a77abb30d3b5fb0b3e0555a39acbe5ebb4) +set(SPRIV_PATCHES ${CMAKE_CURRENT_SOURCE_DIR}/spirv-patches-new/) +set(SPRIV_BRANCH_PATCH spirvdll_100) +find_program(MAKE_EXEC NAMES make gmake) + +if(NOT EXISTS ${SPIRV_COPY}) + message(STATUS "[VC] : Copying stock SPIRV-Translator sources to ${SPIRV_COPY}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_directory ${SPIRV_SOURCES} ${SPIRV_COPY}) +endif() + +apply_patches(${SPIRV_COPY} +${SPRIV_PATCHES} +${SPIRV_REV_PATCH} +${SPRIV_BRANCH_PATCH} +) + +if(IGC_OPTION__FORCE_SYSTEM_LLVM) + + ExternalProject_Add(SPIRVDLL_EX + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/SPIRVDLL + SOURCE_DIR ${SPIRV_COPY} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/spirv-install + BUILD_COMMAND ${MAKE_EXEC} SPIRVDLL + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/spirv-install + ) + +else() + + ExternalProject_Add(SPIRVDLL_EX + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/SPIRVDLL + SOURCE_DIR ${SPIRV_COPY} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/spirv-install -DLLVM_DIR=${LLVM_DIR} + BUILD_COMMAND ${MAKE_EXEC} SPIRVDLL + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/spirv-install + ) + + add_dependencies(SPIRVDLL_EX VCCodeGen) + +endif(IGC_OPTION__FORCE_SYSTEM_LLVM) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/spirv-install/lib/libSPIRVDLL.so + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core +) + +elseif(NOT TARGET SPIRVDLL) + if(DEFINED WIN32) + set(SPIRVDLL_NAME "SPIRVDLL.dll") + else() + set(SPIRVDLL_NAME "libSPIRVDLL.so") + endif() + if(DEFINED SPIRV_PREBUILD_DIR) + set(PREBUILT_SPIRVDLL_PATH "${SPIRV_PREBUILD_DIR}/lib" ) + endif() + find_file(SPIRVDLL_LIB + ${SPIRVDLL_NAME} + PATHS ${PREBUILT_SPIRVDLL_PATH} + NO_DEFAULT_PATH + ) + if(NOT SPIRVDLL_LIB) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL in prebuilds") + endif() + message(STATUS "[VC] Found SPIRVDLL: ${SPIRVDLL_LIB}") + if(WIN32) + if ("${vc_uses_custom_spirv}" STREQUAL "True") + set(INSTALL_SPRIRVDLL_NAME "SPIRVDLL.dll") + if("${_cpuSuffix}" STREQUAL "32") + set(INSTALL_SPRIRVDLL_NAME "SPIRVDLL32.dll") + endif() + install(FILES ${SPIRVDLL_LIB} + CONFIGURATIONS Debug Release + DESTINATION $/lh64 + RENAME ${INSTALL_SPRIRVDLL_NAME} + ) + install(FILES ${SPIRVDLL_LIB} + CONFIGURATIONS ReleaseInternal + DESTINATION Release-Internal/lh64 + RENAME ${INSTALL_SPRIRVDLL_NAME} + ) + endif() + else() + install(FILES + ${SPIRVDLL_LIB} + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core + ) + endif() +else() + get_target_property(SPIRVDLL_IMPORTED SPIRVDLL IMPORTED) + if(SPIRVDLL_IMPORTED) + message(STATUS "[VC] SPIRVDLL is already imported") + else() + message(STATUS "[VC] SPIRVDLL will be built in-tree") + install(FILES + $ + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core + ) + endif() 
+endif() +endif(INSTALL_SPIRVDLL) diff --git a/IGC/VectorCompiler/include/CMakeLists.txt b/IGC/VectorCompiler/include/CMakeLists.txt new file mode 100644 index 000000000000..d3d27fce5e6a --- /dev/null +++ b/IGC/VectorCompiler/include/CMakeLists.txt @@ -0,0 +1,18 @@ +# Special common target for headers that propagates +# needed include directories and dependencies. +add_library(VCHeaders INTERFACE) + +target_include_directories(VCHeaders + INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ) + +add_dependencies(VCHeaders + intrinsics_gen + GenXIntrinsicsGen + VectorCompilerOptions + ) + +# Additional things like header generators. +add_subdirectory(vc) diff --git a/IGC/VectorCompiler/include/vc/CMakeLists.txt b/IGC/VectorCompiler/include/vc/CMakeLists.txt new file mode 100644 index 000000000000..fc23e64eeb7a --- /dev/null +++ b/IGC/VectorCompiler/include/vc/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Support) diff --git a/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h new file mode 100644 index 000000000000..9d0ada5bffa6 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h @@ -0,0 +1,42 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef LLVM_GENX_TARGET_INITIALIZERS_H +#define LLVM_GENX_TARGET_INITIALIZERS_H + +extern "C" void LLVMInitializeGenXTargetInfo(); +extern "C" void LLVMInitializeGenXTarget(); +extern "C" void LLVMInitializeGenXTargetMC(); + +namespace llvm { +void initializeGenX() { + LLVMInitializeGenXTargetInfo(); + LLVMInitializeGenXTarget(); + LLVMInitializeGenXTargetMC(); +} +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h new file mode 100644 index 000000000000..85c6b7109efa --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h @@ -0,0 +1,142 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace vc { + +namespace ocl { + +enum class ArgKind { + General, + LocalSize, // IMPLICIT_LOCAL_SIZE + GroupCount, // IMPLICIT_NUM_GROUPS + Buffer, // 1D buffer + SVM, // stateless global pointer + Sampler, + Image1d, + Image2d, + Image3d, + PrintBuffer, + PrivateBase +}; + +enum class ArgAccessKind { None, ReadOnly, WriteOnly, ReadWrite }; + +struct ArgInfo { + ArgKind Kind; + ArgAccessKind AccessKind; + unsigned Index; + unsigned Offset; + unsigned SizeInBytes; + unsigned BTI; +}; + +struct TableInfo { + void *Buf = nullptr; + uint32_t Size = 0; + uint32_t NumEntries = 0; +}; + +// Mirror of cmc_kernel_info that owns its data. 
+struct KernelInfo { + std::string Name; + std::vector Args; + std::vector PrintStrings; + bool HasGroupID; + bool HasBarriers; + bool HasReadWriteImages; + unsigned SLMSize; + unsigned ThreadPrivateMemSize; + unsigned StatelessPrivateMemSize; + unsigned GRFSizeInBytes; + + TableInfo RelocationTable; + TableInfo SymbolTable; +}; + + +struct CompileInfo { + KernelInfo KernelInfo; + FINALIZER_INFO JitInfo; + std::string GenBinary; +}; + +struct CompileOutput { + std::vector Kernels; + unsigned PointerSizeInBytes; +}; + +} // namespace ocl + +namespace cm { +struct CompileOutput { + std::string IsaBinary; +}; +} // namespace cm + +using CompileOutput = std::variant; + +enum class FileType { + SPIRV, SOURCE +}; + +enum class OptimizerLevel { None, Full }; + +enum class RuntimeKind { CM, OpenCL }; + +struct CompileOptions { + FileType FType = FileType::SPIRV; + std::string CPUStr; + std::unique_ptr WATable = nullptr; + + // Api accessible options. + bool NoVecDecomp = false; + OptimizerLevel OptLevel = OptimizerLevel::Full; + + // Internal options. + RuntimeKind Runtime = RuntimeKind::OpenCL; + bool DumpIsa = false; + bool DumpIR = false; +}; + +llvm::Expected Compile(llvm::ArrayRef Input, + const CompileOptions &Opts); + +llvm::Expected ParseOptions(llvm::StringRef ApiOptions, + llvm::StringRef InternalOptions); +} // namespace vc diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h b/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h new file mode 100644 index 000000000000..ab271556deda --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h @@ -0,0 +1,79 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares some hooks that are injected into llvm analysis library +// files to make them work with genx related stuff. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_GENX_ANALYSIS_H +#define LLVM_GENX_ANALYSIS_H + +namespace llvm { + +template class ArrayRef; +class CallInst; +class Constant; +class DataLayout; +class Instruction; +class ImmutableCallSite; +class Type; +class Use; +class Value; + +/// canConstantFoldGenXIntrinsic - Return true if it is even possible to fold +/// a call to the specified GenX intrinsic. 
+bool canConstantFoldGenXIntrinsic(unsigned IID); + +/// ConstantFoldGenXIntrinsic - Attempt to constant fold a call to the +/// specified GenX intrinsic with the specified arguments, returning null if +/// unsuccessful. +Constant *ConstantFoldGenXIntrinsic(unsigned IID, Type *RetTy, + ArrayRef Operands, + ImmutableCallSite CS, const DataLayout *DL); + +/// ConstantFoldGenX - Attempt to constant fold genx-related instruction (intrinsic). +/// This function tries to fold operands and then tries to fold instruction +/// itself. Returns nullptr if folding was unsuccessful. +Constant *ConstantFoldGenX(Instruction *I, const DataLayout &DL); + +/// Given a GenX intrinsic and a set of arguments, see if we can fold the +/// result. +/// +/// If this call could not be simplified returns null. +Value *SimplifyGenXIntrinsic(unsigned IID, Type *RetTy, Use *ArgBegin, + Use *ArgEnd); + +/// Given a GenX related intruction, see if we can fold the +/// result. This function tries simplification and then constant folding. +/// +/// If this instruction could not be simplified returns null. +Value *SimplifyGenX(CallInst *I); + +} // end namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h b/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h new file mode 100644 index 000000000000..b67804c2f775 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h @@ -0,0 +1,74 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This header file defines prototypes for accessor functions that expose passes +// in the GenX transformations library. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_GENX_OPTS_H +#define LLVM_GENX_OPTS_H + +namespace llvm { + +class FunctionPass; +class ModulePass; +class Pass; + +//===----------------------------------------------------------------------===// +// +// CMImpParam - Transforms to enable implicit parameters +// +Pass *createCMImpParamPass(bool); + +//===----------------------------------------------------------------------===// +// +// CMKernelArgOffset - Determine offset of each CM kernel argument +// +Pass *createCMKernelArgOffsetPass(unsigned GrfByteSize, bool OCLCodeGen); + +//===----------------------------------------------------------------------===// +// +// CMABI - Fix ABI issues for the genx backend. +// +Pass *createCMABIPass(); + +//===----------------------------------------------------------------------===// +// +// CMLowerVLoadVStore - Lower CM reference loads and stores. +// +Pass *createCMLowerVLoadVStorePass(); + +FunctionPass *createGenXReduceIntSizePass(); +FunctionPass *createGenXRegionCollapsingPass(); +FunctionPass *createGenXSimplifyPass(); +FunctionPass *createGenXLayoutBlocksPass(); +FunctionPass *createGenXLowerAggrCopiesPass(); + +ModulePass *createGenXPacketizePass(); +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h new file mode 100644 index 000000000000..421cbbd631d4 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h @@ -0,0 +1,237 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// CMRegion : region information +/// ------------------------------- +/// +/// An object of class CMRegion describes the region parameters of a Gen region. +/// It is a transient object, in that a pass constructs it as needed and then +/// forgets it. It does not persist between passes, as the region parameters are +/// fully described by the arguments to the rdregion and wrregion intrinsics. 
+/// +/// The region parameters in a CMRegion are: +/// +/// * ElementBytes : number of bytes per element +/// * ElementTy : Type of element +/// * NumElements : total number of elements in the region (number of rows is +/// thus NumElements/Width) +/// * VStride : vertical stride in elements +/// * Width : row width in elements +/// * Stride : horizontal stride in elements +/// * Offset : constant part of offset +/// * Indirect : variable index (nullptr for direct region, scalar value for +/// single indirect, vector value for multi indirect) +/// * IndirectIdx : start index in vector indirect. This is always 0 when +/// constructing a CMRegion, but can be set to a non-zero value before +/// calling a method to create a new rdregion/wrregion intrinsic +/// * IndirectAddrOffset : offset from the address value where region +/// origin starts +/// * Mask : mask (predicate) for wrregion, nullptr if none +/// * ParentWidth : the parent width value (a statement that no row crosses a +/// boundary of a multiple of this number of elements) +/// +/// There are the following constructors: +/// +/// * Construct from a Type or Value, setting the GenXRegion to a region that +/// covers the whole value. +/// * Construct from a rdregion/wrregion intrinsic, setting the GenXRegion to the +/// region described by the intrinsic. +/// * Construct from a bitmap of which elements need to be in the region. This +/// is used from GenXConstants when constructing a splat region when loading +/// a constant in multiple stages. +/// +/// CMRegion is not used to represent the region parameters in predicate regions, +/// since they are much simpler. But GenXRegion does contain static methods to create +/// rdpredregion etc intrinsics given the predicate region parameters. +/// +//===----------------------------------------------------------------------===// + +#ifndef CMREGION_H +#define CMREGION_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" + +namespace llvm { + +class Constant; +class DataLayout; +class Value; +class Function; +class Module; +class Type; +class Instruction; +class raw_ostream; +class Twine; +class DebugLoc; +class TargetLibraryInfo; + +// CMRegion : description of an operand's region +class CMRegion { +public: + unsigned ElementBytes; + Type *ElementTy; + unsigned NumElements; + int VStride; + unsigned Width; + int Stride; + int Offset; + Value *Indirect; + unsigned IndirectIdx; // start index in vector Indirect + unsigned IndirectAddrOffset; + Value *Mask; // 0 else mask for wrregion + unsigned ParentWidth; // 0 else parent width + // Default constructor: assume single element + CMRegion() + : ElementBytes(0), ElementTy(0), NumElements(1), VStride(1), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), + IndirectAddrOffset(0), Mask(0), ParentWidth(0) {} + // Construct from a type. + CMRegion(Type *Ty, const DataLayout *DL = nullptr); + // Construct from a value. + CMRegion(Value *V, const DataLayout *DL = nullptr); + // Construct from a rd/wr region/element + CMRegion(Instruction *Inst, bool WantParentWidth = false); + // Construct from a bitmap of which elements to set (legal 1D region) + CMRegion(unsigned Bits, unsigned ElementBytes); + // Create rdregion intrinsic from this Region + // Returns a scalar if the Region has one element and AllowScalar is true. + // Otherwise returns a vector. 
+ Instruction *createRdRegion(Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL, + bool AllowScalar = false); + // Modify Region object for a subregion + void getSubregion(unsigned StartIdx, unsigned Size); + // Create wrregion intrinsic from this Region + Value *createWrRegion(Value *OldVal, Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + // Create wrconstregion intrinsic from this Region + Value *createWrConstRegion(Value *OldVal, Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + // Create rdpredregion from given start index and size + static Instruction *createRdPredRegion(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + static Value *createRdPredRegionOrConst(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Create wrpredregion from given start index + static Instruction *createWrPredRegion(Value *OldVal, Value *Input, + unsigned Index, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Create wrpredpredregion from given start index + static Instruction *createWrPredPredRegion(Value *OldVal, Value *Input, + unsigned Index, Value *Pred, + const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Set the called function in an intrinsic call + static void setRegionCalledFunc(Instruction *Inst); + // Compare two regions to see if they have the same region parameters other + // than start offset (not allowing element type to be different). + bool isStrictlySimilar(const CMRegion &R2) const { + return VStride == R2.VStride && Width == R2.Width && Stride == R2.Stride && + Mask == R2.Mask; + } + // Compare two regions to see if they have the same region parameters other + // than start offset (also allowing element type to be different). + bool isSimilar(const CMRegion &R2) const; + // Compare two regions to see if they have the same region parameters (also + // allowing element type to be different). + bool operator==(const CMRegion &R2) const { + return isSimilar(R2) && Offset == R2.Offset && Indirect == R2.Indirect + && IndirectIdx == R2.IndirectIdx; + } + bool operator!=(const CMRegion &R2) const { return !(*this == R2); } + // Compare two regions to see if they overlaps each other. + bool overlap(const CMRegion &R2) const; + // Test whether a region is scalar + bool isScalar() const { + return !Stride && (Width == NumElements || !VStride); + } + // Test whether a region is 2D + bool is2D() const { return !isScalar() && Width != NumElements; } + // Test whether a region is contiguous. + bool isContiguous() const; + // Test whether a region covers exactly the whole of the given type, allowing + // for the element type being different. + bool isWhole(Type *Ty) const; + // Test whether the region has a whole number of rows. (append() can result + // in a region with an incomplete final row, which is normally not allowed.) + bool isWholeNumRows() const { return !(NumElements % Width); } + // Evaluate rdregion with constant input. 
+ Constant *evaluateConstantRdRegion(Constant *Input, bool AllowScalar); + // evaluateConstantWrRegion : evaluate wrregion with constant inputs + Constant *evaluateConstantWrRegion(Constant *OldVal, Constant *NewVal); + // append : append region AR to this region + bool append(CMRegion AR); + // changeElementType : change the element type of the region + bool changeElementType(Type *NewElementType); + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + // Check whether the region is multi indirect. Returns true if Indirect has + // VectorType (a sign of multi indirection) + bool isMultiIndirect() const { + return Indirect && isa(Indirect->getType()); + } + // Get bit mask in which ones values represent bytes which + // were accessed by this region + BitVector getAccessBitMap(int MinTrackingOffset = 0) const; + // Length of single row in bytes + unsigned getRowLength() const { + return Stride ? (Width * Stride * ElementBytes) : ElementBytes; + } + // Length of whole region in bytes + unsigned getLength() const { + return VStride * ((NumElements / Width) - 1) * ElementBytes + + getRowLength(); + } + +protected: + // Create wrregion or wrconstregion intrinsic from this Region + Value *createWrCommonRegion(GenXIntrinsic::ID, Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); + // Get the function declaration for a region intrinsic + static Function *getGenXRegionDeclaration(Module *M, GenXIntrinsic::ID IID, Type *RetTy, + ArrayRef Args); + // Get (or create instruction for) the start index of a region. + Value *getStartIdx(const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const CMRegion &R) { + R.print(OS); + return OS; +} + +} // end namespace llvm + +#endif /* CMREGION_H */ diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h new file mode 100644 index 000000000000..b7a404dabb89 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h @@ -0,0 +1,80 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H +#define LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H + +#include + +namespace llvm { +namespace genx { + +namespace ranges { + +template +using iterator_t = decltype(std::begin(std::declval())); + +template +using range_pointer_t = + typename std::iterator_traits>::pointer; + +template +using range_reference_t = + typename std::iterator_traits>::reference; + +} // namespace ranges + +/* Returns the first iterator (let's name it RetIt) such that + * std::accumulate(First, RetIt, 0) > Bound (not full truth, read below). + * + * Arguments: + * \p First, \p Last - considered range + * \p Bound - considered Bound + * \p Op - functor that returns T, takes T and decltype(*First) + * respectively as arguments. It is meant to increment current partial sum. + * First argument is previous partial sum, second argument is upcoming value + * from the range, new partial sum is returned. + * + * Arguments of \p PlusEqualOp may not be equal, so the range may possibly point + * not to T type. In this case partial sum is calculated for transformed range + * (transformation is hidden in \p Op). + */ +template +ForwardIt upper_partial_sum_bound(ForwardIt First, ForwardIt Last, T Bound, + PlusEqualOp Op) { + T CurSum = 0; + for (; First != Last; ++First) { + CurSum = Op(CurSum, *First); + if (CurSum > Bound) + return First; + } + return Last; +} + +} // namespace genx +} // namespace llvm + +#endif // LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h new file mode 100644 index 000000000000..b6b5876c97ac --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h @@ -0,0 +1,356 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef GENX_KERNEL_INFO_H +#define GENX_KERNEL_INFO_H + +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" + +namespace llvm { +namespace genx { + +enum { VISA_MAJOR_VERSION = 3, VISA_MINOR_VERSION = 6 }; + +// Utility function to tell whether a Function is a vISA kernel. 
+inline bool isKernel(const Function *F) { + // We use DLLExport to represent a kernel in LLVM IR. + return (F->hasDLLExportStorageClass() || + F->hasFnAttribute(genx::FunctionMD::CMGenXMain)); +} + +// Turn a MDNode into llvm::value or its subclass. +// Return nullptr if the underlying value has type mismatch. +template Ty *getValueAsMetadata(Metadata *M) { + if (auto VM = dyn_cast(M)) + if (auto V = dyn_cast(VM->getValue())) + return V; + return nullptr; +} + +/// KernelMetadata : class to parse kernel metadata +class KernelMetadata { + Function *F = nullptr; + bool IsKernel = false; + StringRef Name; + unsigned SLMSize = 0; + SmallVector ArgKinds; + SmallVector ArgOffsets; + SmallVector ArgIOKinds; + SmallVector ArgTypeDescs; + // Assign a BTI value to a surface or sampler, OCL path only. + // Given buffer x, --> UAV + // read_only image --> SRV + // write_only or read_write image --> UAV + // + // First assign SRV then UAV resources. + SmallVector BTIs; + +public: + // default constructor + KernelMetadata() {} + + /* + * KernelMetadata constructor + * + * Enter: F = Function that purports to be a CM kernel + * + */ + KernelMetadata(Function *F) { + if (!genx::isKernel(F)) + return; + NamedMDNode *Named = + F->getParent()->getNamedMetadata(genx::FunctionMD::GenXKernels); + if (!Named) + return; + + MDNode *Node = nullptr; + for (unsigned i = 0, e = Named->getNumOperands(); i != e; ++i) { + if (i == e) + return; + Node = Named->getOperand(i); + if (Node->getNumOperands() > KernelMDOp::ArgTypeDescs && + getValueAsMetadata(Node->getOperand(KernelMDOp::FunctionRef)) == F) + break; + } + if (!Node) + return; + + // Node is the metadata node for F, and it has the required number of + // operands. + this->F = F; + IsKernel = true; + if (MDString *MDS = dyn_cast(Node->getOperand(KernelMDOp::Name))) + Name = MDS->getString(); + if (ConstantInt *Sz = getValueAsMetadata(Node->getOperand(KernelMDOp::SLMSize))) + SLMSize = Sz->getZExtValue(); + // Build the argument kinds and offsets arrays that should correspond to the + // function arguments (both explicit and implicit) + MDNode *KindsNode = dyn_cast(Node->getOperand(KernelMDOp::ArgKinds)); + MDNode *OffsetsNode = dyn_cast(Node->getOperand(KernelMDOp::ArgOffsets)); + MDNode *InputOutputKinds = dyn_cast(Node->getOperand(KernelMDOp::ArgIOKinds)); + MDNode *ArgDescNode = dyn_cast(Node->getOperand(KernelMDOp::ArgTypeDescs)); + + assert(KindsNode); + + for (unsigned i = 0, e = KindsNode->getNumOperands(); i != e; ++i) { + ArgKinds.push_back( + getValueAsMetadata(KindsNode->getOperand(i)) + ->getZExtValue()); + if (OffsetsNode == nullptr) + ArgOffsets.push_back(0); + else { + assert(OffsetsNode->getNumOperands() == e && "out of sync"); + ArgOffsets.push_back( + getValueAsMetadata(OffsetsNode->getOperand(i)) + ->getZExtValue()); + } + } + assert(InputOutputKinds && + KindsNode->getNumOperands() >= InputOutputKinds->getNumOperands()); + for (unsigned i = 0, e = InputOutputKinds->getNumOperands(); i != e; ++i) + ArgIOKinds.push_back( + getValueAsMetadata(InputOutputKinds->getOperand(i)) + ->getZExtValue()); + assert(ArgDescNode); + for (unsigned i = 0, e = ArgDescNode->getNumOperands(); i < e; ++i) { + MDString *MDS = dyn_cast(ArgDescNode->getOperand(i)); + assert(MDS); + ArgTypeDescs.push_back(MDS->getString()); + } + } + // Accessors + bool isKernel() const { return IsKernel; } + StringRef getName() const { return Name; } + unsigned getSLMSize() const { return SLMSize; } + ArrayRef getArgKinds() const { return ArgKinds; } + unsigned getNumArgs() const { 
return ArgKinds.size(); } + unsigned getArgKind(unsigned Idx) const { return ArgKinds[Idx]; } + StringRef getArgTypeDesc(unsigned Idx) const { + if (Idx >= ArgTypeDescs.size()) + return ""; + return ArgTypeDescs[Idx]; + } + + enum { AK_NORMAL, AK_SAMPLER, AK_SURFACE, AK_VME }; + unsigned getArgCategory(unsigned Idx) const { + switch (getArgKind(Idx) & 7) { + case AK_SAMPLER: + return RegCategory::SAMPLER; + case AK_SURFACE: + return RegCategory::SURFACE; + case AK_VME: + return RegCategory::VME; + default: + return RegCategory::GENERAL; + } + } + + // check if an argument is annotated with attribute "buffer_t". + bool isBufferType(unsigned Idx) const { + return (getArgTypeDesc(Idx).find_lower("buffer_t") != StringRef::npos && + getArgTypeDesc(Idx).find_lower("image1d_buffer_t") == StringRef::npos); + } + + // check if an argument is annotated with attribute "image{1,2,3}d_t". + bool isImageType(unsigned Idx) const { + return getArgTypeDesc(Idx).find_lower("image1d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image2d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image3d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image1d_buffer_t") != StringRef::npos; + } + + int32_t getBTI(unsigned Index) { + if (BTIs.empty()) + computeBTIs(); + assert(Index < BTIs.size()); + return BTIs[Index]; + } + + enum { + // Reserved surface indices start from 253, see GenXCodeGen/GenXVisa.h + // TODO: consider adding a dependency from GenXCodeGen and extract + // "252" from there + K_MaxAvailableBtiIndex = 252 + }; + // Assign BTIs lazily. + void computeBTIs() { + unsigned SurfaceID = 0; + unsigned SamplerID = 0; + auto Desc = ArgTypeDescs.begin(); + // Assign SRV and samplers. + for (auto Kind = ArgKinds.begin(); Kind != ArgKinds.end(); ++Kind) { + BTIs.push_back(-1); + if (*Kind == AK_SAMPLER) + BTIs.back() = SamplerID++; + else if (*Kind == AK_SURFACE) { + StringRef DescStr = *Desc; + // By default, an unannotated surface is read_write. + if (DescStr.find_lower("read_only") != StringRef::npos) { + BTIs.back() = SurfaceID++; + if (SurfaceID > K_MaxAvailableBtiIndex) { + llvm::report_fatal_error("not enough BTI indeces", false); + } + } + } + ++Desc; + } + // Scan again and assign BTI to UAV resources. + Desc = ArgTypeDescs.begin(); + int Idx = 0; + for (auto Kind = ArgKinds.begin(); Kind != ArgKinds.end(); ++Kind) { + if (*Kind == AK_SURFACE && BTIs[Idx] == -1) + BTIs[Idx] = SurfaceID++; + // SVM arguments are also assigned an BTI, which is not necessary, but OCL + // runtime requires it. + if (*Kind == AK_NORMAL) { + StringRef DescStr = *Desc; + if (DescStr.find_lower("svmptr_t") != StringRef::npos) { + BTIs[Idx] = SurfaceID++; + if (SurfaceID > K_MaxAvailableBtiIndex) { + llvm::report_fatal_error("not enough BTI indeces", false); + } + } + } + // print buffer is also assigned with BTI, which is not necessary, but OCL + // runtime requires it. + if (*Kind & KernelMetadata::IMP_OCL_PRINTF_BUFFER) { + BTIs[Idx] = SurfaceID++; + } + + if (*Kind & KernelMetadata::IMP_OCL_PRIVATE_BASE) + BTIs[Idx] = SurfaceID++; + ++Desc, ++Idx; + } + } + + // All the Kinds defined + // These correspond to the values used in vISA + // Bits 0-2 represent category (see enum) + // Bits 7..3 represent the value needed for the runtime to determine what + // the implicit argument should be + // + // IMP_OCL_LOCAL_ID{X, Y, Z} and IMP_OCL_GLOBAL_OR_LOCAL_SIZE apply to OCL + // runtime only. 
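+  // For example, an implicit local-ID-X argument of the normal register
+  // category is encoded as Kind = IMP_OCL_LOCAL_ID_X | AK_NORMAL
+  // = (0x7 << 3) | 0x0 = 0x38; masking with 0x7 recovers the category and
+  // masking with 0xFFF8 recovers the implicit-argument value, so
+  // KernelArgInfo(0x38).isLocalIDX() and isNormalCategory() both return true.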
+ // + enum ImpValue : uint32_t { + IMP_NONE = 0x0, + IMP_LOCAL_SIZE = 0x1 << 3, + IMP_GROUP_COUNT = 0x2 << 3, + IMP_LOCAL_ID = 0x3 << 3, + IMP_SB_DELTAS = 0x4 << 3, + IMP_SB_BTI = 0x5 << 3, + IMP_SB_DEPCNT = 0x6 << 3, + IMP_OCL_LOCAL_ID_X = 0x7 << 3, + IMP_OCL_LOCAL_ID_Y = 0x8 << 3, + IMP_OCL_LOCAL_ID_Z = 0x9 << 3, + IMP_OCL_GROUP_OR_LOCAL_SIZE = 0xA << 3, + IMP_OCL_PRINTF_BUFFER = 0xB << 3, + IMP_OCL_PRIVATE_BASE = 0xC << 3, + IMP_PSEUDO_INPUT = 0x10 << 3 + }; + + enum { SKIP_OFFSET_VAL = -1 }; + // Check if this argument should be omitted as a kernel input. + bool shouldSkipArg(unsigned Idx) const { + return static_cast(ArgOffsets[Idx]) == SKIP_OFFSET_VAL; + } + unsigned getNumNonSKippingInputs() const { + unsigned K = 0; + for (unsigned Val : ArgOffsets) + K += (static_cast(Val) != SKIP_OFFSET_VAL); + return K; + } + unsigned getArgOffset(unsigned Idx) const { return ArgOffsets[Idx]; } + + enum ArgIOKind { + IO_Normal = 0, + IO_INPUT = 1, + IO_OUTPUT = 2, + IO_INPUT_OUTPUT = 3 + }; + ArgIOKind getArgInputOutputKind(unsigned Idx) const { + if (Idx < ArgIOKinds.size()) + return static_cast(ArgIOKinds[Idx] & 0x3); + return IO_Normal; + } + bool isOutputArg(unsigned Idx) const { + auto Kind = getArgInputOutputKind(Idx); + return Kind == ArgIOKind::IO_OUTPUT || Kind == ArgIOKind::IO_INPUT_OUTPUT; + } +}; + +struct KernelArgInfo { + uint32_t Kind; + explicit KernelArgInfo(uint32_t Kind) : Kind(Kind) {} + bool isNormalCategory() const { + return (Kind & 0x7) == genx::KernelMetadata::AK_NORMAL; + } + bool isLocalIDX() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_X; + } + bool isLocalIDY() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_Y; + } + bool isLocalIDZ() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_Z; + } + bool isGroupOrLocalSize() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_GROUP_OR_LOCAL_SIZE; + } + bool isLocalIDs() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_LOCAL_ID; + } + bool isLocalSize() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_LOCAL_SIZE; + } + bool isGroupCount() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_GROUP_COUNT; + } + bool isPrintBuffer() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_PRINTF_BUFFER; + } + bool isPrivateBase() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_PRIVATE_BASE; + } +}; + +} // namespace genx +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h new file mode 100644 index 000000000000..5d7d9ad33405 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h @@ -0,0 +1,55 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above 
copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef GENX_REG_CATEGORIES_H +#define GENX_REG_CATEGORIES_H + +namespace llvm { +namespace genx { + +// The encoding for register category, used in GenXCategory, +// GenXLiveness and GenXVisaRegAlloc. It is an anonymous enum inside a class +// rather than a named enum so you don't need to cast to/from int. +struct RegCategory { + enum { + NONE, + GENERAL, + ADDRESS, + PREDICATE, + SAMPLER, + SURFACE, + VME, + NUMREALCATEGORIES, + EM, + RM, + NUMCATEGORIES + }; +}; + +} // namespace genx +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt b/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt new file mode 100644 index 000000000000..4079289ba9f9 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(VectorCompilerOptions) diff --git a/IGC/VectorCompiler/include/vc/Support/Options.h b/IGC/VectorCompiler/include/vc/Support/Options.h new file mode 100644 index 000000000000..9ff203af166a --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Options.h @@ -0,0 +1,58 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_OPTIONS_H +#define VC_SUPPORT_OPTIONS_H + +#include + +namespace vc { +namespace options { +// Flags should not overlap with llvm::opt::DriverFlag. 
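+// Sketch of the intended use (assuming the usual llvm::opt::OptTable API,
+// not code from this patch): a consumer can test which class an option
+// belongs to via its flags, e.g.
+//   const llvm::opt::OptTable &T = vc::getOptTable();
+//   bool IsApi = T.getOption(vc::options::OPT_vc_codegen)
+//                    .hasFlag(vc::options::ApiOption);
+// The low flag bits are reserved for llvm::opt::DriverFlag values, hence: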
+constexpr unsigned FirstNonBuiltinFlagNum = 4; + +enum Flags { + ApiOption = (1 << FirstNonBuiltinFlagNum), + InternalOption = (ApiOption << 1), + IgcmcApiOption = (InternalOption << 1), +}; + +enum ID { + OPT_INVALID = 0, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "vc/Support/Options.inc" + LastOption +#undef OPTION +}; + +} // namespace options + +const llvm::opt::OptTable &getOptTable(); +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/Options.td b/IGC/VectorCompiler/include/vc/Support/Options.td new file mode 100644 index 000000000000..c79718c955fb --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Options.td @@ -0,0 +1,117 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines the options accepted by vector compiler. +// +// There are two kinds of options: api options and internal options. +// +// Api options are exposed to user via, e.g., openCL clBuildProgram. +// +// Internal options are for passing of additional info of various purposes. +// Among these can be: debug, things that are not exposed to user directly. +// +//===----------------------------------------------------------------------===// + +include "llvm/Option/OptParser.td" + +// Option kinds {{ +// Options accessible using API. +def ApiOption : OptionFlag; + +// Api options compatible with igcmc. +// These are used only when -cmc is present in api options. +def IgcmcApiOption : OptionFlag; + +// Internal options. +def InternalOption : OptionFlag; +// }} Option kinds + +// Api options {{ +let Flags = [ApiOption] in { + +// Main dispatch option. +def vc_codegen : Flag<["-"], "vc-codegen">, + HelpText<"Enable vector codegenerator">; + +def optimize : Separate<["-"], "optimize">, + HelpText<"Set optimization level to either 'none' or 'full'">, + MetaVarName<"">; +def optimize_eq : Joined<["-"], "optimize=">, + Alias; +def no_optimize : Flag<["-"], "no-optimize">, + Alias, AliasArgs<["none"]>; + +def no_vector_decomposition : Flag<["-"], "no-vector-decomposition">, + HelpText<"Disable vector decomposition pass">; +// Old igcmc alias. 
+def no_vector_decomposition_old : Flag<["-"], "no_vector_decomposition">, + Alias { + let Flags = [ApiOption, IgcmcApiOption]; +} + +} +// }} Api options + +// Igcmc compatibility {{ +let Flags = [IgcmcApiOption] in { + +def igcmc : Flag<["-"], "cmc">, + HelpText<"Enable igcmc compatible mode; incompatible with -vc-codegen; implies -optimize=none.">; + +def igcmc_visaopts : Joined<["-"], "visaopts=">, + HelpText<"Options for finalizer in form \"opt1 opt2 opt3...\"">; + +def igcmc_stack_size : Joined<["-"], "stack-mem-size=">, + HelpText<"Control stack memory size (in bytes)">; + +} +// }} Igcmc compatibility + +// Internal options {{ +let Flags = [InternalOption] in { + +def dump_llvm_ir : Flag<["-"], "dump-llvm-ir">, + HelpText<"Dump llvm IR after SPIRV reading, optimizations and codegen">; +def dump_isa_binary : Flag<["-"], "dump-isa-binary">, + HelpText<"Dump isa binary after finalization pass">; + +def help : Flag<["-"], "help">, + HelpText<"Display available API options">; +def help_internal : Flag<["-"], "help-internal">, + HelpText<"Display available internal options">; + +def llvm_options : Separate<["-"], "llvm-options">, + HelpText<"Additional options forwarded to llvm CommandLine global option parser">; +def llvm_options_eq : Joined<["-"], "llvm-options=">, + Alias, HelpText<"Alias for -llvm-options">; + +def runtime : Separate<["-"], "runtime">, + HelpText<"Set runtime for which binary should be generated; values: 'ocl' or 'cm'">; +def runtime_eq : Joined<["-"], "runtime=">, + Alias, HelpText<"Alias for -runtime ">; + +} +// }} Internal options diff --git a/IGC/VectorCompiler/include/vc/Support/Status.h b/IGC/VectorCompiler/include/vc/Support/Status.h new file mode 100644 index 000000000000..7e261a2f1fe8 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Status.h @@ -0,0 +1,158 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUS_H +#define VC_SUPPORT_STATUS_H + +#include "vc/Support/StatusCode.h" + +#include + +#include + +namespace vc { + +class DynLoadError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + DynLoadError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::dynamic_load_fail); + } +}; + +class SymbolLookupError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Library; + std::string Symbol; + +public: + SymbolLookupError(llvm::StringRef Lib, llvm::StringRef Sym) + : Library(Lib.str()), Symbol(Sym.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::symbol_not_found); + } +}; + +class BadSpirvError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + BadSpirvError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::bad_spirv); + } +}; + +class BadBitcodeError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + BadBitcodeError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::bad_bitcode); + } +}; + +class InvalidModuleError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::invalid_module); + } +}; + +class TargetMachineError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::target_machine_not_created); + } +}; + +class NotVCError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::not_vc_codegen); + } +}; + +class OptionError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string BadOption; + bool IsInternal; + +public: + OptionError(llvm::StringRef BadOpt, bool IsInternal_) + : BadOption(BadOpt.str()), IsInternal(IsInternal_) {} + + bool isInternal() const { return IsInternal; } + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + const errc c = + IsInternal ? 
errc::invalid_internal_option : errc::invalid_api_option; + return make_error_code(c); + } +}; + +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/StatusCode.h b/IGC/VectorCompiler/include/vc/Support/StatusCode.h new file mode 100644 index 000000000000..471eafc97a01 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/StatusCode.h @@ -0,0 +1,75 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUSCODE_H +#define VC_SUPPORT_STATUSCODE_H + +#include + +namespace vc { + +enum class errc { + // DynamicLibrary::getPermanentLibrary failure. + dynamic_load_fail = 1, + + // DynamicLibrary::getAddressOfSymbol failure. + symbol_not_found, + + // Spirv read failure. + bad_spirv, + + // Parse bitcode failure. + bad_bitcode, + + // Module verification failure. + invalid_module, + + // Target machine allocation failure. + target_machine_not_created, + + // VC codegen not specified in options. + not_vc_codegen, + + // Bad option in api options. + invalid_api_option, + + // Bad option in internal options. 
+ invalid_internal_option, +}; + +const std::error_category &err_category() noexcept; + +inline std::error_code make_error_code(vc::errc e) noexcept { + return std::error_code(static_cast(e), vc::err_category()); +} + +} // namespace vc + +namespace std { +template <> struct is_error_code_enum : std::true_type {}; +} // namespace std + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/StatusTraits.h b/IGC/VectorCompiler/include/vc/Support/StatusTraits.h new file mode 100644 index 000000000000..f75ab00be381 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/StatusTraits.h @@ -0,0 +1,85 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUSTRAITS_H +#define VC_SUPPORT_STATUSTRAITS_H + +#include "vc/Support/StatusCode.h" + +#include "llvm/ADT/StringRef.h" + +namespace vc { + +// There should be specialization for every error code listed in errc. 
+// Specialization should define: +// * llvm::StringRef getMessage() // return description for error +template struct ErrorTraits; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "failed to load dynamic library"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "symbol lookup error"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "bad spirv bitcode"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "bad llvm bitcode"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "module verification failed"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "target machine creation failed"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "vc codegen path option was not specified"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "invalid api option"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "invalid internal option"; } +}; + +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp b/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp new file mode 100644 index 000000000000..6b14e665c189 --- /dev/null +++ b/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp @@ -0,0 +1,36 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/GenXCodeGen/GenXTarget.h" + +static int initializeAll() { + llvm::initializeGenX(); + return 0; +} + +// This will be initialized on plugin load. +// Can cause problems if library is linked at compilation time. +static const int Init = initializeAll(); diff --git a/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt b/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt new file mode 100644 index 000000000000..06542850ca69 --- /dev/null +++ b/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt @@ -0,0 +1,39 @@ +set(BACKEND_PLUGIN_SOURCES + BackendPlugin.cpp + ) + +add_library(VCBackendPlugin + MODULE + ${BACKEND_PLUGIN_SOURCES} + ) + +# Hack to avoid transitive LLVM dependencies that will break +# plugin because of duplicate global variables. 
+# 'CODEGEN_LIBS' will consist of 'VCCodeGen' and all its direct dependencies. +# This should be enough for now. In case of indirect dependencies on VectorCompiler +# libraries, searching algorithm needs to be improved. +# 'CODEGEN_LIBS_FILES' will be expanded to plain library names so cmake +# will not add any transitive dependencies when target is linked against them. +get_target_property(CODEGEN_LIBS VCCodeGen LINK_LIBRARIES) +set(CODEGEN_LIBS VCCodeGen ${CODEGEN_LIBS}) +foreach(target ${CODEGEN_LIBS}) + # Filter out interface libraries -- these will not produce any files. + get_target_property(TARGET_TYPE ${target} TYPE) + if(NOT ("${TARGET_TYPE}" STREQUAL "INTERFACE_LIBRARY")) + set(CODEGEN_LIBS_FILES ${CODEGEN_LIBS_FILES} "$") + endif() +endforeach() + +# Cmake also does not add any dependencies for libraries +# that are linked this way. +add_dependencies(VCBackendPlugin + ${CODEGEN_LIBS} + ) + +target_link_libraries(VCBackendPlugin + PRIVATE + VCHeaders + ${CODEGEN_LIBS_FILES} + # GenX_IR actually should be linked to LLVMGenXCodeGen. + GenX_IR + ) diff --git a/IGC/VectorCompiler/lib/CMakeLists.txt b/IGC/VectorCompiler/lib/CMakeLists.txt new file mode 100644 index 000000000000..c67d9efb5f3f --- /dev/null +++ b/IGC/VectorCompiler/lib/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory(GenXOpts) +add_subdirectory(GenXCodeGen) +add_subdirectory(Support) + +# Plugin support. +# Only for linux. +if(LLVM_ON_UNIX) + add_subdirectory(BackendPlugin) +endif() diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt new file mode 100644 index 000000000000..78e44fb1226d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt @@ -0,0 +1,84 @@ +add_subdirectory(TargetInfo) +add_subdirectory(Utils) + +set(LLVM_TARGET_DEFINITIONS GenX.td) +tablegen(LLVM GenXGenSubtargetInfo.inc -gen-subtarget) +add_public_tablegen_target(GenXCommonTableGen) + +set(CODEGEN_SOURCES + FunctionGroup.cpp + KillAnalysis.cpp + GenXAddressCommoning.cpp + GenXAggregatePseudoLowering.cpp + GenXAlignmentInfo.cpp + GenXAnalysisDumper.cpp + GenXArgIndirection.cpp + GenXBaling.cpp + GenXCategory.cpp + GenXCFSimplification.cpp + GenXCisaBuilder.cpp + GenXConstants.cpp + GenXCoalescing.cpp + GenXDeadVectorRemoval.cpp + GenXDepressurizer.cpp + GenXExtractVectorizer.cpp + GenXFuncPtrsLowering.cpp + GenXGotoJoin.cpp + GenXGEPLowering.cpp + GenXIMadPostLegalization.cpp + GenXInlineAsmLowering.cpp + GenXIntrinsics.cpp + GenXLayoutBlocks.cpp + GenXLegalization.cpp + GenXLiveRanges.cpp + GenXLiveness.cpp + GenXLowering.cpp + GenXLowerAggrCopies.cpp + GenXEmulate.cpp + GenXModule.cpp + GenXNumbering.cpp + GenXOCLInfoExtractor.cpp + GenXOCLRuntimeInfo.cpp + GenXPatternMatch.cpp + GenXPostLegalization.cpp + GenXPrinter.cpp + GenXPressureTracker.cpp + GenXPromoteArray.cpp + GenXThreadPrivateMemory.cpp + GenXPromotePredicate.cpp + GenXRawSendRipper.cpp + GenXReduceIntSize.cpp + GenXInstCombineCleanup.cpp + GenXRegion.cpp + GenXRegionCollapsing.cpp + GenXRematerialization.cpp + GenXSimdCFConformance.cpp + GenXSubtarget.cpp + GenXTargetMachine.cpp + GenXTidyControlFlow.cpp + GenXUnbaling.cpp + GenXUtil.cpp + GenXVectorDecomposer.cpp + GenXVisaRegAlloc.cpp + GenXWATable.cpp + GenXWrapper.cpp +) + +add_library(VCCodeGen ${CODEGEN_SOURCES}) +add_dependencies(VCCodeGen + GenXUtilBuild + GenXCommonTableGen + ) +target_include_directories(VCCodeGen + PRIVATE + ${VISA_INCLUDE_DIRS} + ${CMAKE_CURRENT_BINARY_DIR} + ) +target_link_libraries(VCCodeGen + LLVMGenXIntrinsics + + VCHeaders + 
VCTransforms + VCTargetInfo + VCSupport + ) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp new file mode 100644 index 000000000000..380e71ede96d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp @@ -0,0 +1,671 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file implements FunctionGroup, FunctionGroupAnalysis and +// FunctionGroupPass. See FunctionGroup.h for more details. +// +// The FunctionGroupPass part was adapted from CallGraphSCCPass.cpp. +// +// This file is currently in lib/Target/GenX, as that is the only place it +// is used. It could be moved somewhere more general. 
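+//
+// For orientation, a minimal FunctionGroupPass has roughly this shape (a
+// sketch only; the authoritative interface is declared in FunctionGroup.h):
+//
+//   struct MyGroupPass : public FunctionGroupPass {
+//     static char ID;
+//     MyGroupPass() : FunctionGroupPass(ID) {}
+//     bool runOnFunctionGroup(FunctionGroup &FG) override {
+//       for (Function *F : FG) {
+//         // inspect or transform each function of this kernel's group
+//         (void)F;
+//       }
+//       return false; // return true if the group was modified
+//     }
+//   };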
+// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +using namespace llvm; + +#include "llvmWrapper/IR/LegacyPassManagers.h" +#include "llvmWrapper/IR/PassTimingInfo.h" + + +#define DEBUG_TYPE "functiongroup-passmgr" + +/*********************************************************************** + * FunctionGroupAnalysis implementation + */ +char FunctionGroupAnalysis::ID = 0; +INITIALIZE_PASS(FunctionGroupAnalysis, "FunctionGroupAnalysis", + "FunctionGroupAnalysis", false, true /*analysis*/) + +ModulePass *llvm::createFunctionGroupAnalysisPass() { + initializeFunctionGroupAnalysisPass(*PassRegistry::getPassRegistry()); + return new FunctionGroupAnalysis(); +} + +// clear : clear out the analysis +void FunctionGroupAnalysis::clear() { + for (auto T : TypesToProcess) + GroupMap[T].clear(); + + for (auto i = begin(), e = end(); i != e; ++i) + delete *i; + for (auto i = NonMainGroups.begin(), e = NonMainGroups.end(); i != e; ++i) + delete *i; + + Groups.clear(); + NonMainGroups.clear(); + M = nullptr; +} + +FunctionGroup *FunctionGroupAnalysis::getGroup(Function *F, FGType Type) { + auto i = GroupMap[Type].find(F); + if (i == GroupMap[Type].end()) + return nullptr; + return i->second; +} + +// getGroup : get the FunctionGroup containing Function F, else 0 +FunctionGroup *FunctionGroupAnalysis::getGroup(Function *F) { + return getGroup(F, FGType::GROUP); +} + +FunctionGroup *FunctionGroupAnalysis::getSubGroup(Function *F) { + return getGroup(F, FGType::SUBGROUP); +} + +// getGroupForHead : get the FunctionGroup for which Function F is the +// head, else 0 +FunctionGroup *FunctionGroupAnalysis::getGroupForHead(Function *F) { + auto FG = getGroup(F); + assert(FG->size()); + if (*FG->begin() == F) + return FG; + return nullptr; +} + +// replaceFunction : replace a Function in a FunctionGroup +// An in-use iterator in the modified FunctionGroup remains valid. +void FunctionGroupAnalysis::replaceFunction(Function *OldF, Function *NewF) { + for (auto T : TypesToProcess) { + auto OldFIt = GroupMap[T].find(OldF); + assert(OldFIt != GroupMap[T].end()); + FunctionGroup *FG = OldFIt->second; + GroupMap[T].erase(OldFIt); + GroupMap[T][NewF] = FG; + for (auto i = FG->begin();; ++i) { + assert(i != FG->end()); + if (*i == OldF) { + *i = NewF; + break; + } + } + } +} + +// addToFunctionGroup : add Function F to FunctionGroup FG +// Using this (rather than calling push_back directly on the FunctionGroup) +// means that the mapping from F to FG will be created, and getGroup() will +// work for this Function. 
+void FunctionGroupAnalysis::addToFunctionGroup(FunctionGroup *FG, Function *F, + FGType Type) { + assert(FG); + assert(FG->getParent()->getModule() == M && + "attaching to FunctionGroup from wrong Module"); + assert(!GroupMap[Type][F] && "Function already attached to FunctionGroup"); + GroupMap[Type][F] = FG; + FG->push_back(F); +} + +// createFunctionGroup : create new FunctionGroup for which F is the head +FunctionGroup *FunctionGroupAnalysis::createFunctionGroup(Function *F, + FGType Type) { + auto FG = new FunctionGroup(this); + if (Type == FGType::GROUP) + Groups.push_back(FG); + else + NonMainGroups.push_back(FG); + addToFunctionGroup(FG, F, Type); + return FG; +} + +// Returns true if pass is simple module pass, +// e.g. it is neither FG pass nor function pass manager. +static bool isModulePass(Pass *P) { + if (P->getPassKind() != PT_Module) + return false; + return !P->getAsPMDataManager(); +} + +static StringRef TypeToAttr(FunctionGroupAnalysis::FGType Type) { + switch (Type) { + case FunctionGroupAnalysis::FGType::GROUP: + return genx::FunctionMD::CMGenXMain; + case FunctionGroupAnalysis::FGType::SUBGROUP: + return genx::FunctionMD::CMStackCall; + default: + llvm_unreachable("Can't gen attribute for nox-existent FG type"); + break; + } + return ""; +} + +bool FunctionGroupAnalysis::buildGroup(CallGraph &Callees, Function *F, + FunctionGroup *curGr, FGType Type) { + bool result = false; + LLVM_DEBUG(dbgs() << "process function " << F->getName() << " from " << curGr + << ", type = " << Type << "\n"); + if (Visited.count(F) > 0) { + if (GroupMap[Type].count(F) > 0 && GroupMap[Type][F] != curGr && + !F->hasFnAttribute(TypeToAttr(Type))) { + ValueToValueMapTy VMap; + Function *ClonedFunc = CloneFunction(F, VMap); + LLVM_DEBUG(dbgs() << "Cloning: " << ClonedFunc->getName() << "\n"); + + result = true; + + for (auto it = F->use_begin(); it != F->use_end();) { + Use *u = &*it++; + auto *CI = dyn_cast(u->getUser()); + assert(CI); + if (GroupMap[Type][CI->getFunction()] == curGr) + *u = ClonedFunc; + } + for (auto T : TypesToProcess) { + if (T >= Type) + break; + addToFunctionGroup(getGroup(F, T), ClonedFunc, T); + } + addToFunctionGroup(curGr, ClonedFunc, Type); + + for (auto &Callee : Callees[F]) { + if (Callee == F) + continue; + LLVM_DEBUG(dbgs() << "Next callee: " << Callee->getName() << "\n"); + result |= buildGroup(Callees, Callee, curGr, Type); + } + } + } else if (!Visited.count(F)) { + Visited[F] = true; + // group is created either on a function with a corresponding attribute + // or on a root of a whole function tree that is kernel (genx_main) + if (F->hasFnAttribute(TypeToAttr(Type)) || + F->hasFnAttribute(genx::FunctionMD::CMGenXMain)) { + LLVM_DEBUG(dbgs() << "Create new group of type " << Type << "\n"); + curGr = createFunctionGroup(F, Type); + } else if (curGr) { + LLVM_DEBUG(dbgs() << "Add to group " << curGr->getHead()->getName() + << " of type " << Type << "\n"); + addToFunctionGroup(curGr, F, Type); + } + for (auto &Callee : Callees[F]) { + LLVM_DEBUG(dbgs() << "Next callee: " << Callee->getName() << "\n"); + result |= buildGroup(Callees, Callee, curGr, Type); + } + } + LLVM_DEBUG(dbgs() << "finish processing function " << F->getName() + << " on level " << Type << "\n"); + return result; +} + +//===----------------------------------------------------------------------===// +// FGPassManager +// +/// FGPassManager manages FPPassManagers and FunctionGroupPasses. 
+/// It actually now imitates MPPassManager because there is no way +/// to extend pass manager structure without modification of +/// LLVM pass managers code. +/// This pass is injected into pass manager stack instead of top-level +/// MPPassManager when there is first time FunctionGroupPass is created. +/// After this manager replaces MPPassManager, it handles all Module and +/// FunctionGroup passes. This manager itself is module pass so it is +/// actually contained in list of module passes of module pass manager +/// as last pass that should be run. However, top-level pass manager do +/// not know anything about this FGPassManager except that it is indirect +/// pass manager, so it will not run it directly. + +namespace { + +class FGPassManager : public ModulePass, public IGCLLVM::PMDataManager { +public: + static char ID; + explicit FGPassManager() : ModulePass(ID), IGCLLVM::PMDataManager() {} + + /// run - Execute all of the passes scheduled for execution. Keep track of + /// whether any of the passes modifies the module, and if so, return true. + bool runOnModule(Module &M) override; + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + + /// Pass Manager itself does not invalidate any analysis info. + void getAnalysisUsage(AnalysisUsage &Info) const override { + // FGPassManager needs FunctionGroupAnalysis. + Info.addRequired(); + Info.setPreservesAll(); + } + + StringRef getPassName() const override { + return "FunctionGroup Pass Manager"; + } + + PMDataManager *getAsPMDataManager() override { return this; } + Pass *getAsPass() override { return this; } + + // Print passes managed by this manager + void dumpPassStructure(unsigned Offset) override { + errs().indent(Offset * 2) << "FunctionGroup Pass Manager\n"; + for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { + Pass *P = getContainedPass(Index); + unsigned DumpOffset = Offset + 1; + // Pretend that there is no FGPassManager when we need to dump + // module pass indentation. + if (isModulePass(P)) + DumpOffset -= 1; + P->dumpPassStructure(DumpOffset); + dumpLastUses(P, DumpOffset); + } + } + + Pass *getContainedPass(unsigned N) { + assert(N < PassVector.size() && "Pass number out of range!"); + return static_cast(PassVector[N]); + } + + PassManagerType getPassManagerType() const override { + return PMT_ModulePassManager; + } + +private: + bool runPassesOnFunctionGroup(unsigned Begin, unsigned End, FunctionGroup &FG); + bool runPassOnFunctionGroup(Pass *P, FunctionGroup &FG); + bool doFGInitialization(unsigned Begin, unsigned End, FunctionGroupAnalysis &FGA); + bool doFGFinalization(unsigned Begin, unsigned End, FunctionGroupAnalysis &FGA); + bool runFGPassSequence(unsigned &Pass); + bool runModulePassSequence(unsigned &Pass, Module &M); +}; + +} // end anonymous namespace. + +char FGPassManager::ID = 0; + +bool FGPassManager::runPassOnFunctionGroup(Pass *P, FunctionGroup &FG) { + bool Changed = false; + llvm::PMDataManager *PM = P->getAsPMDataManager(); + + if (!PM) { + FunctionGroupPass *CGSP = (FunctionGroupPass *)P; + { + TimeRegion PassTimer(getPassTimer(CGSP)); + Changed = CGSP->runOnFunctionGroup(FG); + } + return Changed; + } + + // TODO: there may be also SCC pass manager. + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + FPPassManager *FPP = (FPPassManager *)P; + + // Run pass P on all functions in the current FunctionGroup. 
+ for (auto &F : FG) { + dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getName()); + { + TimeRegion PassTimer(getPassTimer(FPP)); + Changed |= FPP->runOnFunction(*F); + } + F->getContext().yield(); + } + return Changed; +} + + +/// RunPassesOnFunctionGroup - Execute sequential passes of pass manager +/// on the specified FunctionGroup +bool FGPassManager::runPassesOnFunctionGroup(unsigned Begin, unsigned End, + FunctionGroup &FG) { + bool Changed = false; + + // Run selected passes on current FunctionGroup. + for (unsigned PassNo = Begin; PassNo != End; ++PassNo) { + Pass *P = getContainedPass(PassNo); + dumpRequiredSet(P); + + initializeAnalysisImpl(P); + + // Actually run this pass on the current FunctionGroup. + Changed |= runPassOnFunctionGroup(P, FG); + if (Changed) + dumpPassInfo(P, MODIFICATION_MSG, ON_MODULE_MSG, ""); + dumpPreservedSet(P); + + verifyPreservedAnalysis(P); + removeNotPreservedAnalysis(P); + recordAvailableAnalysis(P); + removeDeadPasses(P, "", ON_MODULE_MSG); + } + + return Changed; +} + +/// Initialize sequential FG passes +bool FGPassManager::doFGInitialization(unsigned Begin, unsigned End, + FunctionGroupAnalysis &FGA) { + bool Changed = false; + + for (unsigned i = Begin; i != End; ++i) { + if (llvm::PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) { + // TODO: SCC PassManager? + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + Changed |= ((FPPassManager*)PM)->doInitialization(*FGA.getModule()); + } else { + Changed |= + ((FunctionGroupPass *)getContainedPass(i))->doInitialization(FGA); + } + } + + return Changed; +} + +/// Finalize sequential FG passes +bool FGPassManager::doFGFinalization(unsigned Begin, unsigned End, + FunctionGroupAnalysis &FGA) { + bool Changed = false; + + for (int i = End - 1; i >= static_cast(Begin); --i) { + if (llvm::PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) { + // TODO: SCC PassManager? + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + Changed |= ((FPPassManager*)PM)->doFinalization(*FGA.getModule()); + } else { + Changed |= + ((FunctionGroupPass *)getContainedPass(i))->doFinalization(FGA); + } + } + + return Changed; +} + +bool FGPassManager::runFGPassSequence(unsigned &Pass) { + const unsigned BeginPass = Pass; + const unsigned NumPasses = getNumContainedPasses(); + while (Pass < NumPasses && !isModulePass(getContainedPass(Pass))) + ++Pass; + + // Function group analysis may be invalidated by previous + // module passes so we will need to query it every time we + // execute sequence of passes. + FunctionGroupAnalysis &FGA = getAnalysis(); + bool Changed = false; + + Changed |= doFGInitialization(BeginPass, Pass, FGA); + for (auto *FG : FGA) + Changed |= runPassesOnFunctionGroup(BeginPass, Pass, *FG); + Changed |= doFGFinalization(BeginPass, Pass, FGA); + + return Changed; +} + +bool FGPassManager::runModulePassSequence(unsigned &Pass, Module &M) { + const unsigned BeginPass = Pass; + const unsigned NumPasses = getNumContainedPasses(); + while (Pass < NumPasses && isModulePass(getContainedPass(Pass))) + ++Pass; + + bool Changed = false; + + // Copied from MPPassManager in LegacyPassManager.cpp. + unsigned InstrCount, ModuleCount = 0; + StringMap> FunctionToInstrCount; + bool EmitICRemark = M.shouldEmitInstrCountChangedRemark(); + // Collect the initial size of the module. 
+ if (EmitICRemark) { + InstrCount = initSizeRemarkInfo(M, FunctionToInstrCount); + ModuleCount = InstrCount; + } + + for (unsigned Index = BeginPass; Index < Pass; ++Index) { + auto *MP = static_cast(getContainedPass(Index)); + bool LocalChanged = false; + + dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier()); + dumpRequiredSet(MP); + + initializeAnalysisImpl(MP); + + { + PassManagerPrettyStackEntry X(MP, M); + TimeRegion PassTimer(getPassTimer(MP)); + + LocalChanged |= MP->runOnModule(M); + if (EmitICRemark) { + // Update the size of the module. + ModuleCount = M.getInstructionCount(); + if (ModuleCount != InstrCount) { + int64_t Delta = static_cast(ModuleCount) - + static_cast(InstrCount); + emitInstrCountChangedRemark(MP, M, Delta, InstrCount, + FunctionToInstrCount); + InstrCount = ModuleCount; + } + } + } + + Changed |= LocalChanged; + if (LocalChanged) + dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG, + M.getModuleIdentifier()); + dumpPreservedSet(MP); + dumpUsedSet(MP); + + verifyPreservedAnalysis(MP); + removeNotPreservedAnalysis(MP); + recordAvailableAnalysis(MP); + removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG); + } + + return Changed; +} + +/// run - Execute all of the passes scheduled for execution. Keep track of +/// whether any of the passes modifies the module, and if so, return true. +bool FGPassManager::runOnModule(Module &M) { + bool Changed = false; + + unsigned CurPass = 0; + unsigned NumPasses = getNumContainedPasses(); + while (CurPass != NumPasses) { + // We will always have chain of fg passes followed by + // module passes repeating until there are no passes. + Changed |= runFGPassSequence(CurPass); + Changed |= runModulePassSequence(CurPass, M); + } + + return Changed; +} + +bool FGPassManager::doInitialization(Module &M) { + bool Changed = false; + + // Initialize module passes + for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { + auto *P = getContainedPass(Index); + if (isModulePass(P)) + Changed |= P->doInitialization(M); + } + + return Changed; +} + +bool FGPassManager::doFinalization(Module &M) { + bool Changed = false; + + // Finalize module passes + for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index) { + auto *P = getContainedPass(Index); + if (isModulePass(P)) + Changed |= P->doFinalization(M); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// FunctionGroupPass Implementation +//===----------------------------------------------------------------------===// + +/// Assign pass manager to manage this pass. +void FunctionGroupPass::assignPassManager(PMStack &PMS, + PassManagerType PreferredType) { + // Find module pass manager. + while (!PMS.empty() && + PMS.top()->getPassManagerType() > PMT_ModulePassManager) + PMS.pop(); + + assert(!PMS.empty() && "Unable to handle FunctionGroup Pass"); + FGPassManager *GFP; + + // Check whether this ModulePassManager is our injected function + // group pass manager. If not, replace old module pass manager + // with one for function groups. + auto *PM = PMS.top(); + assert(PM->getPassManagerType() == PMT_ModulePassManager && + "Bad pass manager type for function group pass manager"); + if (PM->getAsPass()->getPassID() == &FGPassManager::ID) + GFP = static_cast(PM); + else { + // Create new FunctionGroup Pass Manager if it does not exist. 
+ + // [1] Create new FunctionGroup Pass Manager + GFP = new FGPassManager(); + + // [2] Set up new manager's top level manager + PMTopLevelManager *TPM = PM->getTopLevelManager(); + TPM->addIndirectPassManager(GFP); + GFP->setTopLevelManager(TPM); + + // [3] Assign manager to manage this new manager. This should not create + // and push new managers into PMS + TPM->schedulePass(GFP); + assert(PMS.top() == PM && "Pass manager unexpectedly changed"); + + // [4] Steal analysis info from module pass manager. + *GFP->getAvailableAnalysis() = std::move(*PM->getAvailableAnalysis()); + + // [5] Replace module pass manager with function group pass manager. + PMS.pop(); + PMS.push(GFP); + } + + GFP->add(this); +} + +/// getAnalysisUsage - For this class, we declare that we require and preserve +/// FunctionGroupAnalysis. If the derived class implements this method, it +/// should always explicitly call the implementation here. +void FunctionGroupPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addPreserved(); +} + +//===----------------------------------------------------------------------===// +// PrintFunctionGroupPass Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// PrintFunctionGroupPass - Print a FunctionGroup +/// +class PrintFunctionGroupPass : public FunctionGroupPass { + std::string Banner; + raw_ostream &Out; // raw_ostream to print on. +public: + static char ID; + PrintFunctionGroupPass(const std::string &B, raw_ostream &o) + : FunctionGroupPass(ID), Banner(B), Out(o) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnFunctionGroup(FunctionGroup &FG) override { + Out << Banner; + for (auto I = FG.begin(), E = FG.end(); I != E; ++I) { + Function *F = *I; + Out << Banner << static_cast(*F); + } + return false; + } +}; +} // end anonymous namespace. + +char PrintFunctionGroupPass::ID = 0; + +Pass *FunctionGroupPass::createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return new PrintFunctionGroupPass(Banner, O); +} + +//===----------------------------------------------------------------------===// +// DominatorTreeGroupWrapperPass Implementation +//===----------------------------------------------------------------------===// +// +// The implementation details of the wrapper pass that holds a DominatorTree +// per Function in a FunctionGroup. 
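+// The per-Function trees are recomputed from scratch each time
+// runOnFunctionGroup is invoked and are deleted again in releaseMemory.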
+// +//===----------------------------------------------------------------------===// +char DominatorTreeGroupWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(DominatorTreeGroupWrapperPass, "groupdomtree", + "Group Dominator Tree Construction", true, true) +INITIALIZE_PASS_END(DominatorTreeGroupWrapperPass, "groupdomtree", + "Group Dominator Tree Construction", true, true) + +void DominatorTreeGroupWrapperPass::releaseMemory() { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + delete i->second; + DTs.clear(); +} + +bool DominatorTreeGroupWrapperPass::runOnFunctionGroup(FunctionGroup &FG) { + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + Function *F = *fgi; + auto DT = new DominatorTree; + DT->recalculate(*F); + DTs[F] = DT; + } + return false; +} + +void DominatorTreeGroupWrapperPass::verifyAnalysis() const { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + i->second->verify(); +} + +void DominatorTreeGroupWrapperPass::print(raw_ostream &OS, + const Module *) const { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + i->second->print(OS); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h new file mode 100644 index 000000000000..e3639616ada9 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h @@ -0,0 +1,280 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// FunctionGroup +/// ------------- +/// +/// FunctionGroup is a generic mechanism for maintaining a group of Functions. +/// +/// FunctionGroupAnalysis is a Module analysis that maintains all the +/// FunctionGroups in the Module. It is up to some other pass to use +/// FunctionGroupAnalysis to create and populate the FunctionGroups, and thus +/// attach some semantics to what a FunctionGroup represents. +/// +/// FunctionGroupPass is a type of pass (with associated pass manager) that +/// runs a pass instance per FunctionGroup. +/// +/// This file is currently in lib/Target/GenX, as that is the only place it +/// is used. It could be moved somewhere more general. 
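+///
+/// A minimal usage sketch (illustrative only: it assumes a pass that can call
+/// getAnalysis, plus a kernel Function ``KernelF`` and a subroutine ``SubF``
+/// that are not part of this patch):
+///
+///   FunctionGroupAnalysis &FGA = getAnalysis<FunctionGroupAnalysis>();
+///   auto *FG = FGA.createFunctionGroup(KernelF,
+///                                      FunctionGroupAnalysis::FGType::GROUP);
+///   FGA.addToFunctionGroup(FG, SubF, FunctionGroupAnalysis::FGType::GROUP);
+///   for (auto *G : FGA)    // iterates the GROUP-type FunctionGroups
+///     for (auto &F : *G)   // head Function comes first
+///       F->print(llvm::errs());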
+/// +//===----------------------------------------------------------------------===// +#ifndef FUNCTIONGROUP_H +#define FUNCTIONGROUP_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" + +#include + +namespace llvm { + +class FunctionGroupAnalysis; +class LLVMContext; +class PMStack; + +//---------------------------------------------------------------------- +// FunctionGroup : a group of Functions +// +class FunctionGroup { + FunctionGroupAnalysis *FGA; + // Vector of Functions in the FunctionGroup. Element 0 is the head. + // Elements are asserting value handles, so we spot when a Function + // in the group gets destroyed too early. + SmallVector, 8> Functions; + +public: + FunctionGroup(FunctionGroupAnalysis *FGA) : FGA(FGA) {} + FunctionGroupAnalysis *getParent() { return FGA; } + // push_back : push a Function into the group. The first time this is done, + // the Function is the head Function. + void push_back(Function *F) { Functions.push_back(AssertingVH(F)); } + // iterator and forwarders. The iterator iterates over the Functions in the + // group, starting with the head Function. + AssertingVH &at(unsigned i) { return Functions[i]; } + typedef SmallVectorImpl>::iterator iterator; + iterator begin() { return Functions.begin(); } + iterator end() { return Functions.end(); } + typedef SmallVectorImpl>::reverse_iterator + reverse_iterator; + reverse_iterator rbegin() { return Functions.rbegin(); } + reverse_iterator rend() { return Functions.rend(); } + size_t size() { return Functions.size(); } + // accessors + Function *getHead() { + assert(size()); + return *begin(); + } + StringRef getName() { return getHead()->getName(); } + LLVMContext &getContext() { return getHead()->getContext(); } + Module *getModule() { return getHead()->getParent(); } +}; + +//---------------------------------------------------------------------- +// FunctionGroupAnalysis - a Module analysis that maintains all the +// FunctionGroups in the Module. It is up to some other pass to use +// FunctionGroupAnalysis to create the FunctionGroups and then populate them. +// +class FunctionGroupAnalysis : public ModulePass { +public: + // FunctionGroup types: + // * GROUP - GENX_MAIN kernel and its underlying callgraph + // * SUBGROUP - GENX_STACKCALL function and its underlying callgraph including + // subroutines only + // Groups are necessary to perform cloning of subroutines + // called from different kernels and/or stack functions + enum class FGType { GROUP, SUBGROUP, MAX }; + const FGType TypesToProcess[static_cast(FGType::MAX)] = { + FGType::GROUP, FGType::SUBGROUP}; + +private: + Module *M; + SmallVector Groups; + + // storage for FunctionGroups that aren't of type GROUP, + // i.e. 
not necessarily GENX_MAIN headed + // TODO: mb increase 8 as there can be many stack funcs hence may subgroups + SmallVector NonMainGroups; + + class FGMap { + using ElementType = std::map; + ElementType data[static_cast(FGType::MAX)]; + + public: + ElementType &operator[](FGType type) { + auto index = static_cast(type); + return data[index]; + } + }; + + FGMap GroupMap; + std::map Visited; + using CallGraph = std::map>; + +public: + static char ID; + explicit FunctionGroupAnalysis() : ModulePass(ID) {} + ~FunctionGroupAnalysis() { clear(); } + virtual StringRef getPassName() const { return "function group analysis"; } + // runOnModule : does almost nothing + bool runOnModule(Module &ArgM) { + clear(); + M = &ArgM; + return false; + } + // getModule : get the Module that this FunctionGroupAnalysis is for + Module *getModule() { return M; } + // clear : clear out the FunctionGroupAnalysis + void clear(); + // getGroup : get the FunctionGroup containing Function F, else 0 + FunctionGroup *getGroup(Function *F, FGType Type); + FunctionGroup *getGroup(Function *F); + FunctionGroup *getSubGroup(Function *F); + // getGroupForHead : get the FunctionGroup for which Function F is the + // head, else 0 + FunctionGroup *getGroupForHead(Function *F); + // replaceFunction : replace a Function in a FunctionGroup + void replaceFunction(Function *OldF, Function *NewF); + // iterator for FunctionGroups in the analysis + typedef SmallVectorImpl::iterator iterator; + iterator begin() { return iterator(Groups.begin()); } + iterator end() { return iterator(Groups.end()); } + size_t size() { return Groups.size(); } + // addToFunctionGroup : add Function F to FunctionGroup FG + // Using this (rather than calling push_back directly on the FunctionGroup) + // means that the mapping from F to FG will be created, and getGroup() will + // work for this Function. + void addToFunctionGroup(FunctionGroup *FG, Function *F, FGType Type); + // createFunctionGroup : create new FunctionGroup for which F is the head + FunctionGroup *createFunctionGroup(Function *F, FGType Type); + bool buildGroup(CallGraph &callees, Function *F, + FunctionGroup *curGr = nullptr, FGType Type = FGType::GROUP); + + void clearVisited() { Visited.clear(); } +}; + +ModulePass *createFunctionGroupAnalysisPass(); +void initializeFunctionGroupAnalysisPass(PassRegistry &); + +inline raw_ostream &operator<<(raw_ostream &OS, + const FunctionGroupAnalysis::FGType &T) { + switch (T) { + case FunctionGroupAnalysis::FGType::GROUP: + OS << "Group"; + break; + case FunctionGroupAnalysis::FGType::SUBGROUP: + OS << "Subgroup"; + break; + default: + llvm_unreachable("Invalid FG type"); + break; + } + return OS; +} + +//---------------------------------------------------------------------- +// FunctionGroupPass - a type of pass (with associated pass manager) that +// runs a pass instance per FunctionGroup. +// +class FunctionGroupPass : public Pass { +public: + static constexpr unsigned PassType = PT_PassManager + 1; + + explicit FunctionGroupPass(char &pid) : Pass(static_cast(PassType), pid) {} + + // createPrinterPass - Get a pass that prints the Module + // corresponding to a FunctionGroupAnalysis. + Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const override; + + using llvm::Pass::doFinalization; + using llvm::Pass::doInitialization; + + // doInitialization - This method is called before the FunctionGroups of the + // program have been processed, allowing the pass to do initialization as + // necessary. 
+ virtual bool doInitialization(FunctionGroupAnalysis &FGA) { return false; } + + // runOnFunctionGroup - This method should be implemented by the subclass to + // perform whatever action is necessary for the specified FunctionGroup. + // + virtual bool runOnFunctionGroup(FunctionGroup &FG) = 0; + + // doFinalization - This method is called after the FunctionGroups of the + // program have been processed, allowing the pass to do final cleanup as + // necessary. + virtual bool doFinalization(FunctionGroupAnalysis &FGA) { return false; } + + // Assign pass manager to manager this pass + void assignPassManager(PMStack &PMS, PassManagerType PMT) override; + + // Return what kind of Pass Manager can manage this pass. + PassManagerType getPotentialPassManagerType() const override { + return PMT_ModulePassManager; + } + + // getAnalysisUsage - For this class, we declare that we require and + // preserve the FunctionGroupAnalysis. + // If the derived class implements this method, it should + // always explicitly call the implementation here. + void getAnalysisUsage(AnalysisUsage &Info) const override; +}; + +//---------------------------------------------------------------------- +// DominatorTreeGroupWrapperPass : Analysis pass which computes a DominatorTree +// per Function in the FunctionGroup. +class DominatorTree; + +class DominatorTreeGroupWrapperPass : public FunctionGroupPass { + std::map DTs; + +public: + static char ID; + + DominatorTreeGroupWrapperPass() : FunctionGroupPass(ID) {} + ~DominatorTreeGroupWrapperPass() { releaseMemory(); } + + DominatorTree *getDomTree(Function *F) { return DTs[F]; } + const DominatorTree &getDomTree(); + + bool runOnFunctionGroup(FunctionGroup &FG) override; + + void verifyAnalysis() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + void releaseMemory() override; + + void print(raw_ostream &OS, const Module *M = nullptr) const override; +}; +void initializeDominatorTreeGroupWrapperPassPass(PassRegistry &); + +} // end namespace llvm +#endif // ndef FUNCTIONGROUP_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h new file mode 100644 index 000000000000..3c8b42037f67 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h @@ -0,0 +1,157 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef TARGET_GENX_H +#define TARGET_GENX_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/LoopInfo.h" +#include + +namespace llvm { + +class BasicBlock; +class CallInst; +class Constant; +class DebugLoc; +class DominatorTree; +class formatted_raw_ostream; +class Function; +class FunctionGroup; +class FunctionGroupPass; +class FunctionPass; +class GenXSubtarget; +class Instruction; +class MDNode; +class ModulePass; +class ShuffleVectorInst; +class TargetOptions; +class Twine; +class Value; +class raw_ostream; +class raw_pwrite_stream; + +enum BalingKind { + BK_Legalization, // build baling info for legalization + BK_CodeGen, // build baling info for the final vISA emission + BK_Analysis, // build baling info for analysis (register pressure) +}; + +FunctionPass *createGenXPrinterPass(raw_ostream &O, const std::string &Banner); +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); +FunctionPass *createGenXAnalysisDumperPass(FunctionPass *Analysis, const char *Suffix); +FunctionGroupPass *createGenXGroupAnalysisDumperPass(FunctionGroupPass *Analysis, const char *Suffix); + +FunctionPass *createGenXCFSimplificationPass(); +ModulePass *createGenXEarlySimdCFConformancePass(); +FunctionPass *createGenXReduceIntSizePass(); +FunctionPass *createGenXInstCombineCleanup(); +FunctionPass *createGenXInlineAsmLoweringPass(); +FunctionPass *createGenXLoweringPass(); +FunctionPass *createGenXLowerAggrCopiesPass(); +FunctionPass *createGenXGEPLoweringPass(); +FunctionPass *createGenXRegionCollapsingPass(); +FunctionPass *createGenXExtractVectorizerPass(); +FunctionPass *createGenXRawSendRipperPass(); +FunctionPass *createGenXFuncBalingPass(BalingKind Kind, GenXSubtarget *ST); +FunctionPass *createGenXLegalizationPass(); +ModulePass *createGenXEmulatePass(); +FunctionPass *createGenXDeadVectorRemovalPass(); +FunctionPass *createGenXPatternMatchPass(const TargetOptions *Options); +FunctionPass *createGenXPostLegalizationPass(); +FunctionPass *createTransformPrivMemPass(); +ModulePass *createGenXThreadPrivateMemoryPass(); +FunctionPass *createGenXPromotePredicatePass(); +FunctionPass *createGenXIMadPostLegalizationPass(); +FunctionPass *createGenXAggregatePseudoLoweringPass(); +ModulePass *createGenXModulePass(); +FunctionGroupPass *createGenXLateSimdCFConformancePass(); +FunctionGroupPass *createGenXLivenessPass(); +ModulePass *createGenXFunctionPointersLoweringPass(); +FunctionGroupPass *createGenXCategoryPass(); +FunctionGroupPass *createGenXGroupBalingPass(BalingKind Kind, GenXSubtarget *ST); +FunctionGroupPass *createGenXUnbalingPass(); +FunctionGroupPass *createGenXDepressurizerPass(); +FunctionGroupPass *createGenXLateLegalizationPass(); +FunctionGroupPass *createGenXNumberingPass(); +FunctionGroupPass *createGenXLiveRangesPass(); +FunctionGroupPass *createGenXRematerializationPass(); +FunctionGroupPass *createGenXCoalescingPass(); +FunctionGroupPass *createGenXAddressCommoningPass(); +FunctionGroupPass *createGenXArgIndirectionPass(); +FunctionPass *createGenXTidyControlFlowPass(); +FunctionGroupPass *createGenXVisaRegAllocPass(); 
+FunctionGroupPass *createGenXVisaFuncWriterPass(); +FunctionGroupPass *createGenXCisaBuilderPass(); +ModulePass *createGenXFinalizerPass(raw_pwrite_stream &o); +ModulePass *createGenXVisaWriterPass(raw_pwrite_stream &o); + +namespace genx { + +// A local encoding (not part of vISA or GenX) of whether an operand should be signed. +enum Signedness { + DONTCARESIGNED = 3, SIGNED = 1, UNSIGNED = 2 +}; + +const constexpr int BoolBits = 1; +const constexpr int ByteBits = 8; +const constexpr int WordBits = 16; +const constexpr int DWordBits = 32; +const constexpr int QWordBits = 64; +const constexpr int GRFBits = 256; + +const constexpr int ByteBytes = ByteBits / ByteBits; +const constexpr int WordBytes = WordBits / ByteBits; +const constexpr int DWordBytes = DWordBits / ByteBits; +const constexpr int QWordBytes = QWordBits / ByteBits; + +// vISA allows [-512,511] for operation to be baled as offset +// for rdregion, copied from visa +const constexpr int G4_MAX_ADDR_IMM = 511; +const constexpr int G4_MIN_ADDR_IMM = -512; + +// describe integer vector immediate (V, UV) +enum ImmIntVec { + Width = 8, // num elem in vector + ElemSize = 4, // in bits + MaxUInt = (1 << ElemSize) - 1, + MinUInt = 0, + MaxSInt = (1 << (ElemSize - 1)) - 1, + MinSInt = -(1 << (ElemSize - 1)) +}; + +} // End genx namespace +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td new file mode 100644 index 000000000000..93775e9e14bb --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td @@ -0,0 +1,87 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This is a target description file for the Intel Gen architecture, referred +// to here as the "GenX" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... 
+// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// GenX Subtarget features - these are typically passed in as features +//===----------------------------------------------------------------------===// + +def DumpRegAlloc: SubtargetFeature<"dump_regalloc", "DumpRegAlloc", + "true", "dump regalloc information">; + +//===----------------------------------------------------------------------===// +// GenX Subtarget state - these are typically inferred from the Proc +//===----------------------------------------------------------------------===// + +def FeatureLongLong : SubtargetFeature<"longlong","HasLongLong", "true", + "supports long long">; + +def FeatureNoJmpi : SubtargetFeature<"disable_jmpi", "DisableJmpi", + "true", "disable jmpi">; + +def FeatureVectorDecomp : SubtargetFeature<"disable_vec_decomp", + "DisableVectorDecomposition", + "true", + "disable vector decomposition pass">; + +def WarnCallable : SubtargetFeature<"warn_callable", "WarnCallable", + "true", "warn instead of error on callable violation">; + +def OCLRuntime : SubtargetFeature<"ocl_runtime", "OCLRuntime", "true", + "Prepare structures for OCL runtime">; + + +//===----------------------------------------------------------------------===// +// GenX processors supported. +//===----------------------------------------------------------------------===// + +class Proc Features> + : Processor; + +def : Proc<"generic", []>; +def : Proc<"HSW", []>; +def : Proc<"BDW", [FeatureLongLong]>; +def : Proc<"CHV", [FeatureLongLong]>; +def : Proc<"SKL", [FeatureLongLong]>; +def : Proc<"BXT", [FeatureLongLong]>; +def : Proc<"KBL", [FeatureLongLong]>; +def : Proc<"GLK", [FeatureLongLong]>; +def : Proc<"CNL", [FeatureLongLong]>; +def : Proc<"ICL", [FeatureLongLong]>; +def : Proc<"ICLLP", []>; +def : Proc<"TGLLP", []>; + +def GenX : Target { + // Nothing here (yet?) +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp new file mode 100644 index 000000000000..7bafdc398ab3 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp @@ -0,0 +1,1047 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXAddressCommoning +/// -------------------- +/// +/// This pass spots when multiple address conversions use the same value and +/// are used in regions with the same base register (the same coalesced live +/// range), and commons up the address conversions. +/// +/// It also handles cases where an llvm.genx.add.addr has an out of range offset +/// that is not encodable as the constant offset in an indirect operand. When +/// commoning up address conversions, it groups ones with nearby offsets such +/// that all uses of a commoned address conversion have in range offsets in +/// their llvm.genx.add.addr ops. +/// +/// Before this pass, GenXCategoryConversion has ensured that each use of a +/// variable index in an element or region access (llvm.genx.rdregion etc +/// intrinsics) has its own separate address conversion (llvm.genx.convert.addr +/// intrinsic). Any constant add/sub between the address conversion +/// and the use of the variable index has been turned into an llvm.genx.add.addr +/// intrinsic. +/// +/// This GenXAddressCommoning pass spots when multiple address conversions +/// use the same index value as input and are used in element/region accesses +/// with the same base register. These can then be commoned up. +/// +/// In fact, rather than looking at an address conversion in isolation, it needs +/// to look at the whole bale containing the address conversion, which might have +/// a baled in rdregion and modifiers. It needs to do this because +/// GenXBaling cloned the rdregion and modifiers, so they need commoning up +/// again with the address conversion. +/// This situation is common because GenXLowering lowers a trunc (as often +/// found in an index calculation to convert the index to i16) into a bitcast +/// and a rdregion. +/// +/// A second transformation in this pass is the "histogram optimization": If +/// there are multiple scalar address conversions for the same base reg where +/// each index is an extract (a scalar rdregion) from the same index vector, we +/// attempt to common them up into a vector address conversion, with an extract +/// from the result of the vector address conversion for each user of an +/// original scalar address conversion. The extract is baled in to the indirect +/// region, appearing as the "addr_offset" field (the index into the 8 wide +/// address register) in the generated vISA. +/// +/// This histogram optimization uses the hasIndirectGRFCrossing feature from +/// GenXSubtarget to tell how big the combined vector address conversion can be, +/// in the case that it itself is an indirect region. +/// +/// Both of the transformations in this pass are fiddly because the pass runs so +/// late. It has to run this late because we cannot tell whether address +/// conversions can be commoned up until GenXCoalescing has decided which vectors +/// are in the same register, but that then means that this pass has to update +/// live ranges and baling info for the code that it modifies. +/// +/// **IR restriction**: After this pass, the restrictions on +/// ``llvm.genx.convert.addr`` and ``llvm.genx.add.addr`` having just a single +/// use are relaxed. 
Now, multiple uses of ``llvm.genx.convert.addr``, possibly +/// each via a single ``llvm.genx.add.addr``, must be in rdregions/wrregions +/// where the base register is provably the same because all the values that +/// appear as the "old value" input are coalesced together into the same +/// LiveRange. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ADDRESSCOMMONING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm-c/Core.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt ConvertAfterWholeRegion("convert-after-whole", cl::init(true), cl::Hidden, + cl::desc("Convert addrs after whole region conversion attempt")); + +namespace { + +// Bucket : a bucket for collecting address conversions with the same base reg +// and the same address calculation value, discarding duplicates. +struct Bucket { + SmallVector Addrs; + SmallSet AddrSet; + void add(Instruction *Addr) { + if (AddrSet.insert(Addr).second) + Addrs.push_back(Addr); + } +}; + +// ExtractBucket : a bucket for collecting address conversions with the same +// base reg that all use an extract (scalar rdregion) from the same vector, +// discarding duplicates. +struct ExtractBucket { + SmallVector Addrs; + SmallSet AddrSet; + void add(Instruction *Addr) { + if (AddrSet.insert(Addr).second) + Addrs.push_back(Addr); + } +}; + +// Extract: address and offset for region conversion +struct Extract { + Instruction *Addr; // the address conversion instruction + int Offset; // the offset from the rdregion; + Extract(Instruction *Addr, int Offset) : Addr(Addr), Offset(Offset) {} + bool operator<(const Extract &Other) const { return Offset < Other.Offset; } +}; + +// GenX address conversion pass +class GenXAddressCommoning : public FunctionGroupPass { + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + const GenXSubtarget *ST; + Function *F; + SmallSet AlreadyProcessed; + // Types and data structures used for gathering convert_addr ops that + // could be commoned up: + // InnerVec is a vector of convert_addr ops that have the same base register + // and bale hash. OuterVec is a vector of InnerVec. OuterMap provides a + // way of finding the element of OuterVec for a particular base register + // and bale hash. Using a vector and map together like this ensures that + // we process everything in the same order even as pointer values and hashes + // change from one compiler run to another. 
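+  // Viewed as a lookup structure, this is a two-level grouping:
+  //   OuterMap[{BaseReg, BaleHash}] -> the matching InnerVec within OuterVec
+  //   InnerVec                      -> the convert.addr instructions whose
+  //                                    bales share that base register and hash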
+ typedef SmallVector InnerVec_t; + typedef SmallVector OuterVec_t; + OuterVec_t OuterVec; + struct BaseRegAndBaleHash { + LiveRange *BaseReg; + hash_code BaleHash; + BaseRegAndBaleHash(LiveRange *BaseReg, hash_code BaleHash) + : BaseReg(BaseReg), BaleHash(BaleHash) {} + static bool less(BaseRegAndBaleHash BRH1, BaseRegAndBaleHash BRH2) + { + if (BRH1.BaseReg != BRH2.BaseReg) + return BRH1.BaseReg < BRH2.BaseReg; + return BRH1.BaleHash < BRH2.BaleHash; + } + }; + typedef std::map OuterMap_t; + OuterMap_t OuterMap; +public: + static char ID; + explicit GenXAddressCommoning() : FunctionGroupPass(ID), + OuterMap(OuterMap_t(BaseRegAndBaleHash::less)) { } + virtual StringRef getPassName() const { return "GenX address commoning"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + bool processFunction(Function *F); + bool processBaseReg(LiveRange *LR); + bool processCommonAddrs(ArrayRef Addrs); + void processCommonAddrsWithValidOffsets(ArrayRef Addrs); + bool vectorizeAddrs(LiveRange *LR); + void addAddrConvIfExtract(std::map, ExtractBucket> *ExtractBuckets, Value *Index); + bool tryConvertWholeRegion(SmallVector &Extracts, + Instruction *VecDef); + bool vectorizeAddrsFromOneVector(ArrayRef Addrs); + DominatorTree *getDominatorTree(); + bool isValueInCurrentFunc(Value *V); +}; + +} // end anonymous namespace + +char GenXAddressCommoning::ID = 0; +namespace llvm { void initializeGenXAddressCommoningPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXAddressCommoning, "GenXAddressCommoning", "GenXAddressCommoning", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXAddressCommoning, "GenXAddressCommoning", "GenXAddressCommoning", false, false) + +FunctionGroupPass *llvm::createGenXAddressCommoningPass() +{ + initializeGenXAddressCommoningPass(*PassRegistry::getPassRegistry()); + return new GenXAddressCommoning(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the address commoning pass for this + * FunctionGroup + */ +bool GenXAddressCommoning::runOnFunctionGroup(FunctionGroup &FG) +{ + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + ST = getAnalysis().getSubtarget(); + bool Modified = false; + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + F = *fgi; + Modified |= processFunction(F); + } + return Modified; +} + +/*********************************************************************** + * processFunction : process one function in the address commoning pass + */ +bool GenXAddressCommoning::processFunction(Function *F) +{ + // Build a list of base registers used in an indirect rdregion or wrregion. + // This does a preordered depth first traversal of the CFG to + // ensure that we see a def before its uses (ignoring phi node uses). 
+ // Because an llvm.genx.convert.addr intrinsic can bale in a rdregion + // with a variable index that itself uses an llvm.genx.convert.addr, + // we want to process the code in forward order so that we can do as + // much commoning as possible. + SmallVector BaseRegs; + std::set BaseRegsSet; + for (df_iterator i = df_begin(&F->getEntryBlock()), + e = df_end(&F->getEntryBlock()); i != e; ++i) { + for (auto bi = i->begin(), be = i->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + LiveRange *LR = nullptr; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + default: + continue; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum))) + continue; + LR = Liveness->getLiveRange(Inst->getOperand(0)); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum))) + continue; + // A write region may be baled into a g_store. + LR = Liveness->getLiveRangeOrNull(Inst); + if (!LR) { + assert(Inst->hasOneUse()); + auto SI = dyn_cast(Inst->user_back()); + if (!SI) + continue; + Value *GV = getUnderlyingGlobalVariable(SI->getPointerOperand()); + if (!GV) + continue; + LR = Liveness->getLiveRange(GV); + } + break; + } + // Inst is rdregion or wrregion with non-constant index. + // Save the base register. + if (BaseRegsSet.insert(LR).second) + BaseRegs.push_back(LR); // not seen before + } + } + BaseRegsSet.clear(); + // Process each base register. + bool Modified = false; + for (auto i = BaseRegs.begin(), e = BaseRegs.end(); i != e; ++i) { + Modified |= processBaseReg(*i); + Modified |= vectorizeAddrs(*i); + } + return Modified; +} + +/*********************************************************************** + * processBaseReg : process one base register + * + * Enter: LR = LiveRange with all the values for this base register + * + * We common up all address conversions with the same input that are used to + * address a region of this base register. + */ +bool GenXAddressCommoning::processBaseReg(LiveRange *LR) +{ + // Gather the address conversions used by regions of this base register into + // buckets, one for each distinct input. A bucket discards duplicate address + // conversions. + std::map Buckets; + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + // Ignore the value if it is in the wrong function. That can happen because + // liveness information is shared between functions in the same group. + if (!isValueInCurrentFunc(V)) + continue; + // First the def, if it is a wrregion. + if (GenXIntrinsic::isWrRegion(V)) { + Value *Index = cast(V)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + // Then each use that is a rdregion. (A use that is a wrregion will be + // handled when we look at that value, which must be coalesced into the + // same live range.) 
+ for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + if (ui->getOperandNo() != GenXIntrinsic::GenXRegion::OldValueOperandNum) + continue; + auto user = cast(ui->getUser()); + + auto isBaledWrr = [=]() { + if (!isa(V) || !GenXIntrinsic::isWrRegion(user) || !user->hasOneUse()) + return false; + StoreInst *SI = dyn_cast(user->user_back()); + GlobalVariable *GV = + SI ? getUnderlyingGlobalVariable(SI->getPointerOperand()) : nullptr; + if (!GV) + return false; + // make sure the base is the right global variable. + return Liveness->getLiveRangeOrNull(GV) == LR; + }; + + // wrr may have been baled with a g_store. + if (isBaledWrr()) { + Value *Index = cast(user)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + + if (!GenXIntrinsic::isRdRegion(user)) + continue; + Value *Index = user->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + } + // Common up each bucket with more than one address conversion. + bool Modified = false; + for (auto i = Buckets.begin(), e = Buckets.end(); i != e; ++i) + Modified |= processCommonAddrs(i->second.Addrs); + return Modified; +} + +/*********************************************************************** + * processCommonAddrs : common up some address conversions + * + * Enter: Addrs = one or more address conversion instructions that all have + * the same input and address the same base register, with no + * duplicates. Offsets (in add.addr intrinsics) are not known to + * be in range; this function fixes that. + * + * Return: whether code modified + * + * This function relies on there being no duplicates in Addrs in the way that + * it erases the address conversions other than the one it uses as the common + * one. + * + * This processes a batch of address conversions with add.addr offsets close + * enough to each other that we can use constant offsets in the indirect + * operands. Then it recursively calls itself with what is left after removing + * that batch. + * + * This code relies on there only being one add.addr between a convert.addr and + * the use of the added address in a rdregion/wrregion. GenXCategory ensures + * that this is the case. + */ +bool GenXAddressCommoning::processCommonAddrs(ArrayRef Addrs) +{ +#ifndef NDEBUG + // Assert that we do not have any duplicates, and that they are all in the + // current function. + { + std::set AddrSet; + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + assert(AddrSet.insert(*i).second); + assert((*i)->getParent()->getParent() == F); + } + } +#endif + bool Modified = false; + // Get the offsets. (Each address conversion has only one use; that is how + // GenXCategory set it up.) 
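+  // As an illustration of the batching below: if the add.addr offsets are
+  // {0, 40, 1500}, MinOffset is 0, the batch handled here is {0, 40} (both
+  // within [MinOffset, MinOffset + 1023]), and {1500} is deferred to the
+  // recursive call at the end of this function.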
+ SmallVector Offsets; + for (unsigned i = 0, e = Addrs.size(); i != e; ++i) { + int Offset = 0; + assert(Addrs[i]->hasOneUse()); + auto AddrUse = cast(Addrs[i]->use_begin()->getUser()); + if (GenXIntrinsic::getGenXIntrinsicID(AddrUse) == + GenXIntrinsic::genx_add_addr) { + // The offset is operand 1 of the add_addr, and it is either a constant + // int or a splat of a constant int. + auto C = cast(AddrUse->getOperand(1)); + if (isa(C->getType())) + C = C->getSplatValue(); + Offset = cast(C)->getSExtValue(); + } + Offsets.push_back(Offset); + } + // Get the min offset. + int MinOffset = INT_MAX; + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) + MinOffset = std::min(MinOffset, Offsets[i]); + // Split the address conversions into ones used with an offset in + // [MinOffset,MinOffset+1023] and ones that are outside that range. + SmallVector InRangeAddrs; + SmallVector OutOfRangeAddrs; + int MaxOffset = INT_MIN; + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { + if (Offsets[i] < MinOffset + 1024) { + InRangeAddrs.push_back(Addrs[i]); + MaxOffset = std::max(MaxOffset, Offsets[i]); + } else + OutOfRangeAddrs.push_back(Addrs[i]); + } + // We handle the ones in range here. + // The address conversions are going to be commoned up. Decide what offset we + // are going to put on the commoned up one. We ensure that the offset is + // inside the range of offsets that we found in the uses of the address + // conversions, to try and avoid the situation where the address conversion + // generates an out-of-range value in the address register that is then + // brought back into range by the immediate offset in each use of the address + int CommonOffset = 0; + if (MinOffset < 0) { + if (MaxOffset < 0) { + // All offsets are negative. Use 0 if that is in range, else as close to + // the max end of the offset range as we can get, rounded down to a + // multiple of 32. + if (MinOffset < G4_MIN_ADDR_IMM) + CommonOffset = std::min(MinOffset + 512, MaxOffset) & -32; + } else { + // Some negative and some non-negative. Common offset can be 0. + CommonOffset = 0; + } + } else { + // All offsets are non-negative. Use 0 if that is in range, else as close + // to the min end of the offsets range as we can get, rounded up to a + // multiple of 32. + if (MaxOffset >= 512) + CommonOffset = (std::max(MaxOffset - 511, MinOffset) + 31) & -32; + } + if (CommonOffset) { + Modified = true; + // Modify the address conversions to use the common offset, and adjust the + // address adds accordingly. + auto CommonOffsetVal = ConstantInt::get(InRangeAddrs[0]->getType() + ->getScalarType(), CommonOffset); + for (unsigned i = 0, e = InRangeAddrs.size(); i != e; ++i) { + Instruction *Addr = InRangeAddrs[i]; + Addr->setOperand(1, CommonOffsetVal); + Use *U = &*Addr->use_begin(); + auto *AddAddr = cast(U->getUser()); + int AdjustedOffset = -CommonOffset; + if (GenXIntrinsic::getGenXIntrinsicID(AddAddr) == + GenXIntrinsic::genx_add_addr) { + auto ThisOffsetC = cast(AddAddr->getOperand(1)); + if (isa(ThisOffsetC->getType())) + ThisOffsetC = ThisOffsetC->getSplatValue(); + AdjustedOffset += cast(ThisOffsetC) ->getSExtValue(); + } else if (AdjustedOffset) { + // We don't have an add_addr. We need to insert one. 
+ Constant *C = ConstantInt::get(CommonOffsetVal->getType(), + AdjustedOffset); + if (auto VT = dyn_cast(Addr->getType())) + C = ConstantVector::getSplat(VT->getNumElements(), C); + auto CI = createAddAddr(Addr, C, + Addr->getName() + ".addaddr", AddAddr); + *U = CI; + AddAddr = CI; + } else + AddAddr = nullptr; + if (AddAddr) { + // Adjust the offset on the add_addr. The offset is operand 1 of the + // add_addr, and it is either a constant int or a splat of a constant + // int. + Constant *C = ConstantInt::get(CommonOffsetVal->getType(), + AdjustedOffset); + if (auto VT = dyn_cast(AddAddr->getOperand(1)->getType())) + C = ConstantVector::getSplat(VT->getNumElements(), C); + AddAddr->setOperand(1, C); + // Ensure the add_addr is baled in to the rdregion/wrregion that uses + // it. (It was not if we have just created it, or if its offset was out + // of range.) Also remove its live range. + assert(AddAddr->hasOneUse()); + auto User = cast(AddAddr->use_begin()->getUser()); + assert(GenXIntrinsic::isRdRegion(User) || GenXIntrinsic::isWrRegion(User)); + auto BI = Baling->getBaleInfo(User); + BI.setOperandBaled(AddAddr->use_begin()->getOperandNo()); + Baling->setBaleInfo(User, BI); + Liveness->eraseLiveRange(AddAddr); + } + } + } + // Now we can actually common up the in range addresses, if more than one of + // them. + if (InRangeAddrs.size() > 1) { + Modified = true; + processCommonAddrsWithValidOffsets(InRangeAddrs); + } + // Call recursively to process the remaining (out of range) ones. + if (!OutOfRangeAddrs.empty()) + Modified |= processCommonAddrs(OutOfRangeAddrs); + return Modified; +} + +/*********************************************************************** + * processCommonAddrsWithValidOffsets : common up some address conversions + * + * Enter: Addrs = two or more address conversion instructions that all have + * the same input and address the same base register, with no + * duplicates, and all have valid in range offsets (add.addr intrinsics) + * + * This function relies on there being no duplicates in Addrs in the way that + * it erases the address conversions other than the one it uses as the common + * one. + */ +void GenXAddressCommoning::processCommonAddrsWithValidOffsets( + ArrayRef Addrs) +{ + // Find the address conversion that dominates all the others. + Instruction *DominatingAddr = findClosestCommonDominator( + getDominatorTree(), Addrs); + if (DominatingAddr && DominatingAddr->isTerminator()) { + // Ensure we have a legal insertion point in the presence of SIMD CF. + auto InsertBefore = GotoJoin::getLegalInsertionPoint(DominatingAddr, + getDominatorTree()); + // We did not find one address conversion that dominates all of them. Move + // an arbitrarily chosen one to the end of the dominating basic block. + // This position dominates the other address conversions, and is dominated + // by the index input value. + // We need to move the entire bale, not just the address conversion + // instruction itself. The whole bale is given an instruction number the + // same as the terminator of the closest common dominator block that it is + // being inserted before. 
Doing this is a bit dodgy because the result of + // the address conversion does not appear to interfere with the operands + // of a cmp baled into a conditional branch, but in practice this is not + // a problem because the result of an address conversion is an address + // register and the + unsigned Num = Numbering->getNumber(InsertBefore); + Bale B; + Baling->buildBale(Addrs[0], &B); + for (auto i = B.begin(), e = B.end(); i != e; ++i) { + Instruction *Inst = i->Inst; + DominatingAddr = Inst; + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + Numbering->setNumber(Inst, Num); + } + } + // Use the dominating one instead of all the others. + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + Instruction *Addr = *i; + if (Addr == DominatingAddr) + continue; + Addr->replaceAllUsesWith(DominatingAddr); + do { + auto Next = dyn_cast(Addr->getOperand(0)); + Liveness->removeValue(Addr); + + // It happens that after commoning there are unused dangling instructions + // in some cases and vISA writer asserts. + bool EraseAddr = true; + if (GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_rdregionf) { + Value *Idx = Addr->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + auto II = dyn_cast(Idx); + if (II && II->hasOneUse()) { + Addr->eraseFromParent(); + EraseAddr = false; + + assert(II->use_empty()); + Liveness->removeValue(II); + II->eraseFromParent(); + } + } + if (EraseAddr) + Addr->eraseFromParent(); + + Addr = Next; + } while (Addr && Addr->use_empty()); + } + // Rebuild the live range for the common address calculation. + // Note that we do not rebuild the live ranges for the input(s) to the + // common address calculation bale; this is conservative. + Liveness->rebuildLiveRange(Liveness->getLiveRange(DominatingAddr)); +} + +/*********************************************************************** + * vectorizeAddrs : attempt to vectorize address conversions for one base reg + * + * Enter: LR = LiveRange with all the values for this base register + * + * If there are multiple scalar address conversions for this base reg where + * the index is an extract from the same vector, we attempt to common them up + * into a vector address conversion with extracts from the result. This is the + * histogram optimization. + */ +bool GenXAddressCommoning::vectorizeAddrs(LiveRange *LR) +{ + // Gather the address conversions from an extract from a vector used by + // regions of this base register into buckets, one for each distinct vector + // being extracted from and each distinct address conversion offset. + std::map, ExtractBucket> ExtractBuckets; + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + // Ignore the value if it is in the wrong function. That can happen because + // liveness information is shared between functions in the same group. + if (!isValueInCurrentFunc(V)) + continue; + // First the def, if it is a wrregion. + if (GenXIntrinsic::isWrRegion(V)) { + Value *Index = cast(V)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + addAddrConvIfExtract(&ExtractBuckets, Index); + } + // Then each use that is a rdregion. (A use that is a wrregion will be + // handled when we look at that value, which must be coalesced into the + // same live range.) 
+ for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + if (ui->getOperandNo() != GenXIntrinsic::GenXRegion::OldValueOperandNum) + continue; + auto user = cast(ui->getUser()); + if (!GenXIntrinsic::isRdRegion(user)) + continue; + Value *Index = user->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + addAddrConvIfExtract(&ExtractBuckets, Index); + } + } + // Process each bucket of address calculations that extract from the + // same vector. + bool Modified = false; + for (auto i = ExtractBuckets.begin(), e = ExtractBuckets.end(); i != e; ++i) + if (i->second.Addrs.size() >= 2) + Modified |= vectorizeAddrsFromOneVector(i->second.Addrs); + return Modified; +} + +/*********************************************************************** + * addAddrConvIfExtract : add an address conversion to the appropriate + * bucket if the address is an extract from a vector + * + * Enter: ExtractBuckets = map of buckets + * Index = index operand from rdregion/wrregion + * + * Possibly after traversing some add_addr ops, Index is a constant or a + * convert_addr. If it is a convert_addr whose input is an extract (a scalar + * rdregion) with a single use, add the convert_addr to the bucket for the + * vector that the extract is extracted from. + */ +void GenXAddressCommoning::addAddrConvIfExtract( + std::map, ExtractBucket> *ExtractBuckets, Value *Index) +{ + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (isa(Index)) + return; + assert(GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr); + auto RdR = dyn_cast(cast(Index)->getOperand(0)); + if (!GenXIntrinsic::isRdRegion(RdR)) + return; + assert(RdR); + if (!isa(RdR->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum))) + return; + if (!RdR->hasOneUse()) + return; + auto AddrConv = cast(Index); + int AddrConvOffset = cast(AddrConv->getOperand(1))->getSExtValue(); + (*ExtractBuckets)[std::pair(RdR->getOperand(0), AddrConvOffset)] + .add(AddrConv); +} + +/*********************************************************************** + * tryConvertWholeRegion : attempt to convert whole region + * + * Enter: Extracts -- array of address conversions, extracted from + * inputs to vectorizeAddrsFromOneVector, + * combined with corresponding region offset + * VecDef -- instruction definition from first extract + * + * This is subroutine of vectorizeAddrsFromOneVector, see more comments + * in parent function. Idea of this subroutine is to convert whole + * region if possible + */ +bool GenXAddressCommoning::tryConvertWholeRegion(SmallVector &Extracts, Instruction *VecDef) { + Instruction *InsertBefore = Extracts[0].Addr; + unsigned int MinNum, MaxNum; + + // maximal difference between MinNum and MaxNum to accept region + // TODO: to be tuned? 
+ const int SIZE_THRESHOLD = 48; + + MinNum = MaxNum = Numbering->getNumber(InsertBefore); + // check every extract + for (unsigned Idx = 0, End = Extracts.size(); Idx < End; ++Idx) { + Instruction *RdR = cast(Extracts[Idx].Addr->getOperand(0)); + Region R(RdR, BaleInfo()); + if (R.NumElements > 1 && R.Stride > 1) + return false; + // all address-conv must be in the same basic block + if (Extracts[Idx].Addr != InsertBefore && + Extracts[Idx].Addr->getParent() != InsertBefore->getParent()) { + LLVM_DEBUG(errs() << "tryConvertWholeRegion: not all in the same block\n"); + return false; + } + // test to update the insertion-point + unsigned int ThisNum = Numbering->getNumber(Extracts[Idx].Addr); + if (ThisNum < MinNum) { + InsertBefore = Extracts[Idx].Addr; + MinNum = ThisNum; + } + if (ThisNum > MaxNum) + MaxNum = ThisNum; + } + if ((MaxNum - MinNum) > SIZE_THRESHOLD) + return false; + // Create a vectorized address conversion and bale the new rdregion (if + // any) into it. Give the new vectorized address conversion, and the new + // rdregion (if any), the number of one less than the insert point. + int AddrConvOffset = + cast(Extracts[0].Addr->getOperand(1))->getSExtValue(); + auto NewConv = createConvertAddr(VecDef, AddrConvOffset, + Extracts[0].Addr->getName() + ".monted", InsertBefore); + NewConv->setDebugLoc(VecDef->getDebugLoc()); + Numbering->setNumber(NewConv, Numbering->getNumber(VecDef) + 1); + // For each original address conversion, replace it with an + // extract from the vectorized convert, and bale the extract into + // its use. If it has more than one use, create an extract per use + // (because a baled in instruction must be single use). + for (unsigned Idx2 = 0, End2 = Extracts.size(); Idx2 < End2; ++Idx2) { + auto OldConv = Extracts[Idx2].Addr; + Instruction *OldExtract = cast(OldConv->getOperand(0)); + Region R2(OldExtract, BaleInfo()); + while (!OldConv->use_empty()) { + auto ui = OldConv->use_begin(); + auto user = cast(ui->getUser()); + auto NewExtract = R2.createRdRegion(NewConv, OldConv->getName(), user, + user->getDebugLoc(), /*ScalarAllowed=*/!OldConv->getType()->isVectorTy()); + Numbering->setNumber(NewExtract, Numbering->getNumber(user)); + // At this late stage, I believe nothing relies on the baling type for + // this instruction being set to RDREGION, but we set it anyway for + // completeness. + Baling->setBaleInfo(NewExtract, BaleInfo(BaleInfo::RDREGION)); + BaleInfo BI = Baling->getBaleInfo(user); + BI.setOperandBaled(ui->getOperandNo()); + Baling->setBaleInfo(user, BI); + *ui = NewExtract; + } + Liveness->removeValue(OldConv); + assert(!Liveness->getLiveRangeOrNull(OldExtract) && "expected extract to be baled in"); + OldConv->eraseFromParent(); + OldExtract->eraseFromParent(); + } + // Give the new vectorized address conversion a live range. + auto LR = Liveness->getOrCreateLiveRange(NewConv); + LR->setCategory(RegCategory::ADDRESS); + Liveness->rebuildLiveRange(LR); + return true; +} + +/*********************************************************************** + * vectorizeAddrsFromOneVector : attempt to vectorize address conversions + * + * Enter: Addrs = address conversions for the same base reg with the same + * offset that are all scalar rdregion (constant offset) from + * the same vector, at least two of them + * + * If there are multiple scalar address conversions for this base reg where the + * index is an extract from the same vector, we attempt to common them up into + * a vector address conversion with extracts from the result. 
This is the + * histogram optimization. + */ +bool GenXAddressCommoning::vectorizeAddrsFromOneVector( + ArrayRef Addrs) +{ + bool Modified = false; + SmallVector Extracts; + bool HasVector = false; + std::set OffsetSet; + LLVM_DEBUG(dbgs() << "Collecting addrs: " << Addrs.size() << "\n"); + + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + Instruction *Addr = *i; + LLVM_DEBUG(Addr->dump()); + + Region R(cast(Addr->getOperand(0)), BaleInfo()); + LLVM_DEBUG(dbgs() << " [" << R.Offset << "]\n"); + + Extracts.push_back(Extract(Addr, R.Offset)); + OffsetSet.insert(R.Offset); + if (isa(Addr->getType())) + HasVector = true; + } + bool ConvertWholeRegion = false; + Instruction *FirstRdR = cast(Extracts[0].Addr->getOperand(0)); + assert(FirstRdR); + Instruction *VecDef = cast(FirstRdR->getOperand(0)); + assert(VecDef); + + unsigned InputNumElements = VecDef->getType()->getVectorNumElements(); + + if (HasVector) { + if (InputNumElements == 2 || InputNumElements == 4 || + InputNumElements == 8 || InputNumElements == 16) + ConvertWholeRegion = true; + else + return Modified; + } + else if (OffsetSet.size()*3 >= InputNumElements*2 && + (InputNumElements == 2 || InputNumElements == 4 || + InputNumElements == 8 || InputNumElements == 16)) + ConvertWholeRegion = true; + + // Sort into offset order. + std::sort(Extracts.begin(), Extracts.end()); + + if (ConvertWholeRegion) { + bool Success = tryConvertWholeRegion(Extracts, VecDef); + if (Success) { + LLVM_DEBUG(dbgs() << "Succesfully converted whole region\n"); + return true; + } + + LLVM_DEBUG(dbgs() << "Failed to convert whole region\n"); + if (!ConvertAfterWholeRegion) + return false; + } + + // if we tried to convert whole region and failed + // we shall check that we will try to optimize further + // correct extract set + assert(Extracts.size() > 0); + Type *FirstType = Extracts[0].Addr->getOperand(0)->getType(); + assert(FirstType); + + for (auto e : Extracts) { + Type *Tp = e.Addr->getOperand(0)->getType(); + if (ConvertWholeRegion && (Tp != FirstType)) + return false; + } + + // Scan through the address conversions... + for (unsigned Idx = 0, Num = 1, End = Extracts.size(); + Idx < End - 2; Idx += Num) { + // See how many extracts we can take in one go that have evenly spaced + // offsets, max 8. + int Diff = Extracts[Idx + 1].Offset - Extracts[Idx].Offset; + for (Num = 2; Num != 8 && Num != End - Idx; ++Num) + if (Extracts[Idx + Num].Offset - Extracts[Idx + Num - 1].Offset != Diff) + break; + if (Num == 1) + continue; + // We have a sequence of more than one extract. Construct the region + // parameters for it. + Instruction *FirstRdR = cast(Extracts[Idx].Addr->getOperand(0)); + LLVM_DEBUG(dbgs() << "Sequence of " << Num << " instructions found. First one is:\n"); + LLVM_DEBUG(FirstRdR->dump()); + LLVM_DEBUG(dbgs() << "\n"); + Region R(FirstRdR, BaleInfo()); + R.NumElements = R.Width = Num; + R.Stride = Diff / R.ElementBytes; + // See how big we can legally make the region. + unsigned InputNumElements = FirstRdR + ->getOperand(0)->getType()->getVectorNumElements(); + Num = R.getLegalSize(0, /*Allow2D=*/true, InputNumElements, ST); + if (Num == 1) + continue; + // Even after legalizing the region, we can still vectorize to more than + // one element. + R.getSubregion(0, Num); + // Determine where to insert the new rdregion (if any) and vectorized + // address conversion. 
+ SmallVector Addrs; + for (unsigned i = 0; i != Num; ++i) + Addrs.push_back(Extracts[Idx + i].Addr); + auto InsertBefore = findClosestCommonDominator(getDominatorTree(), Addrs); + // Ensure we have a legal insertion point in the presence of SIMD CF. + InsertBefore = GotoJoin::getLegalInsertionPoint(InsertBefore, + getDominatorTree()); + // Read the region containing all the scalar indices we are commoning + // up. (If R is the identity region, just use the whole original vector + // instead.) + Value *NewRdR = cast(Extracts[Idx].Addr->getOperand(0)) + ->getOperand(0); + Instruction *NewRdRInst = nullptr; + if (InputNumElements != R.NumElements) { + // Not identity region. + NewRdR = NewRdRInst = R.createRdRegion(NewRdR, + Extracts[Idx].Addr->getName(), InsertBefore, + Extracts[Idx].Addr->getDebugLoc(), false); + Baling->setBaleInfo(NewRdRInst, BaleInfo(BaleInfo::RDREGION)); + } + // Create a vectorized address conversion and bale the new rdregion (if + // any) into it. Give the new vectorized address conversion, and the new + // rdregion (if any), the number of one less than the insert point. + int AddrConvOffset = cast(Addrs[0]->getOperand(1))->getSExtValue(); + auto NewConv = createConvertAddr(NewRdR, AddrConvOffset, + Extracts[Idx].Addr->getName() + ".histogrammed", InsertBefore); + NewConv->setDebugLoc(Extracts[Idx].Addr->getDebugLoc()); + Numbering->setNumber(NewConv, Numbering->getNumber(InsertBefore) - 1); + if (NewRdRInst) { + Numbering->setNumber(NewRdRInst, Numbering->getNumber(InsertBefore) - 1); + BaleInfo BI(BaleInfo::MAININST); + BI.setOperandBaled(0); + Baling->setBaleInfo(NewConv, BI); + } + // For each original scalar address conversion, replace it with an + // extract from the vectorized convert, and bale the extract in to + // its use. If it has more than one use, create an extract per use + // (because a baled in instruction must be single use). + for (unsigned Idx2 = 0; Idx2 != Num; ++Idx2) { + auto OldConv = Extracts[Idx + Idx2].Addr; + Region R2(NewConv); + R2.getSubregion(Idx2, 1); + while (!OldConv->use_empty()) { + auto ui = OldConv->use_begin(); + auto user = cast(ui->getUser()); + auto NewExtract = R2.createRdRegion(NewConv, OldConv->getName(), user, + user->getDebugLoc(), /*ScalarAllowed=*/true); + Numbering->setNumber(NewExtract, Numbering->getNumber(user)); + // At this late stage, I believe nothing relies on the baling type for + // this instruction being set to RDREGION, but we set it anyway for + // completeness. + Baling->setBaleInfo(NewExtract, BaleInfo(BaleInfo::RDREGION)); + BaleInfo BI = Baling->getBaleInfo(user); + BI.setOperandBaled(ui->getOperandNo()); + Baling->setBaleInfo(user, BI); + *ui = NewExtract; + } + Liveness->removeValue(OldConv); + auto OldExtract = cast(OldConv->getOperand(0)); + assert(!Liveness->getLiveRangeOrNull(OldExtract) && "expected extract to be baled in"); + OldConv->eraseFromParent(); + OldExtract->eraseFromParent(); + } + // Give the new vectorized address conversion a live range. 
+ auto LR = Liveness->getOrCreateLiveRange(NewConv); + LR->setCategory(RegCategory::ADDRESS); + Liveness->rebuildLiveRange(LR); + Modified = true; + } + return Modified; +} + +/*********************************************************************** + * getDominatorTree : get dominator tree for current function + */ +DominatorTree *GenXAddressCommoning::getDominatorTree() +{ + return getAnalysis().getDomTree(F); +} + +/*********************************************************************** + * isValueInCurrentFunc : determine whether V is in the current function + * + * Enter: V = value from a LiveRange (therefore it is an Instruction + * or an Argument) + * + * Return: true if it is in the current function + */ +bool GenXAddressCommoning::isValueInCurrentFunc(Value *V) +{ + if (auto Inst = dyn_cast(V)) { + auto BB = Inst->getParent(); + if (!BB) + return false;; // unified return value + return BB->getParent() == F; + } + if (isa(V)) + return false; + return cast(V)->getParent() == F; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp new file mode 100644 index 000000000000..4c1ab6ba4d60 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp @@ -0,0 +1,366 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXAggregatePseudoLowering +/// --------------------------- +/// +/// The pass is meant to replace all instructions that work with aggregate +/// values with instructions that work with elementary types (scalar, vector), +/// so there's no aggregate values in IR at all. But this pass doesn't do full +/// job, that's why it has pseudo in its name. +/// This pass replaces every instruction (except call, extract/insertvalue, etc) +/// that either has aggregate as operand, or returns an aggregate with series +/// of extractvalue instructions (if there was an aggregate operand) which +/// return only elementary values, then sequence of splits of the original +/// instruction (but now each one is working only with an elementary value) and +/// finally the sequence of insertvalues that join all elementary results back +/// to the original aggregate result. 
+/// +/// Example: +/// Before pass: +/// %struct_t = type { <16 x float>, <16 x float>, <16 x float> } +/// %res = select i1 %c, %struct_t %arg.0, %struct_t %arg.1 +/// After pass: +/// %struct_t = type { <16 x float>, <16 x float>, <16 x float> } +/// %arg.0.0 = extractvalue %struct_t %arg.0, 0 +/// %arg.0.1 = extractvalue %struct_t %arg.0, 1 +/// %arg.0.2 = extractvalue %struct_t %arg.0, 2 +/// %arg.1.0 = extractvalue %struct_t %arg.1, 0 +/// %arg.1.1 = extractvalue %struct_t %arg.1, 1 +/// %arg.1.2 = extractvalue %struct_t %arg.1, 2 +/// %res.0 = select i1 %c, <16 x float> %arg.0.0, <16 x float> %arg.1.0 +/// %res.1 = select i1 %c, <16 x float> %arg.0.1, <16 x float> %arg.1.1 +/// %res.2 = select i1 %c, <16 x float> %arg.0.2, <16 x float> %arg.1.2 +/// %tmp.0 = insertvalue %struct_t undef, <16 x float> %res.0, 0 +/// %tmp.1 = insertvalue %struct_t %tmp.0, <16 x float> %res.1, 1 +/// %res = insertvalue %struct_t %tmp.1, <16 x float> %res.2, 2 +/// +/// As you can see the pass doesn't fully get rid of aggregate values, it only +/// locally replaces operations over aggregates with operations over elementary +/// fields of aggregates. But if there is the instruction combine pass after +/// this pass, it can easily merge extractvalue and insertvalue so the there's +/// no aggregate values in code anymore. +/// +/// Terminology: +/// Split instructions - the instructions into which original instruction +/// is split, e.g. %res.0, %res.1, %res.2 are split insts +/// (%res is corresponding original instruction) +/// Split operands - the instructions into which original operands are split, +/// they are always extractvalue instructions, e.g. +/// %arg.0.0, %arg.0.1, %arg.0.2 are split operands +/// (%arg.0 is corresponding original operand) +/// +/// Note: split instruction operands is operands of a split instruction, not +/// split operands, though split instruction operands contain at least one +/// split operand, e.g. %c, %arg.0.0, %arg.1.0 for %res.0 instruction. +/// +/// TODO: currently this pass can only handle only flat structures (without +/// nested aggregates). Supported instructions are phi and select. +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// It is a map between original aggregate instruction operand +// and corresponding split operands. +// Split operands are always extractvalue instructions. 
+using SplitOpsMap = std::unordered_map>; + +class GenXAggregatePseudoLowering : public FunctionPass { + std::vector ToErase; + +public: + static char ID; + explicit GenXAggregatePseudoLowering() : FunctionPass(ID) {} + StringRef getPassName() const override { + return "GenX aggregate pseudo lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; + +private: + void processInst(Instruction &Inst); +}; + +} // end namespace + +char GenXAggregatePseudoLowering::ID = 0; +namespace llvm { +void initializeGenXAggregatePseudoLoweringPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXAggregatePseudoLowering, + "GenXAggregatePseudoLowering", + "GenXAggregatePseudoLowering", false, false) +INITIALIZE_PASS_END(GenXAggregatePseudoLowering, "GenXAggregatePseudoLowering", + "GenXAggregatePseudoLowering", false, false) + +FunctionPass *llvm::createGenXAggregatePseudoLoweringPass() { + initializeGenXAggregatePseudoLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXAggregatePseudoLowering; +} + +void GenXAggregatePseudoLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +// is at least one of instruction's operands an aggregate value +static bool hasAggregateOperand(const Instruction &Inst) { + return llvm::any_of(Inst.operand_values(), [](const Value *V) { + return V->getType()->isAggregateType(); + }); +} + +// does instruction have an aggregate as an operand or return value +static bool hasAggregate(const Instruction &Inst) { + return Inst.getType()->isAggregateType() || hasAggregateOperand(Inst); +} + +bool GenXAggregatePseudoLowering::runOnFunction(Function &F) { + std::vector WorkList; + auto WorkRange = make_filter_range(instructions(F), [](Instruction &Inst) { + return hasAggregate(Inst) && !isa(Inst) && + !isa(Inst) && !isa(Inst) && + !isa(Inst); + }); + llvm::transform(WorkRange, std::back_inserter(WorkList), + [](Instruction &Inst) { return &Inst; }); + if (WorkList.empty()) + return false; + + for (auto *Inst : WorkList) + processInst(*Inst); + + for (auto *Inst : ToErase) + Inst->eraseFromParent(); + ToErase.clear(); + return true; +} + +// Returns first instruction after provided instruciton \p Inst, +// before which new instruction can be inserted. +static Instruction *getFirstInsertionPtAfter(Instruction &Inst) { + if (isa(Inst)) + return Inst.getParent()->getFirstNonPHI(); + return Inst.getNextNode(); +} + +// Returns first instruction before which new instruction that represent new +// operand can be inserted, so the new instruction precedes provided +// instruction. \p Inst. Operand \Op is the operator to be updated. +static Instruction *getFirstInsertionPtBefore(Use &Op, Instruction &Inst) { + if (!isa(Inst)) + return &Inst; + return cast(Inst).getIncomingBlock(Op)->getTerminator(); +} + +// Arguments: +// \p Inst - an instruction +// \p Op - operand of the instruction \p Inst +// +// Returns an instruction before which new operand for instruction \p Inst, +// that correspond to the operand \p Op, can be inserted +static Instruction *getInsertionPtForSplitOp(Use &Op, Instruction &Inst) { + auto &OpVal = *Op.get(); + if (isa(OpVal)) + return getFirstInsertionPtAfter(cast(OpVal)); + assert(isa(OpVal) && "only instruction or constant are expected"); + return getFirstInsertionPtBefore(Op, Inst); +} + +// Arguments: +// \p Inst - an instruction +// \p Op - operand of the instruction \p Inst +// +// Splits operand \p Op of the instruction \p Inst into elementary values. 
+// Those values are extractvalue instructions. Inserts those instruction in +// proper places, so if we insert new instruction right after or right before +// \p Inst those instructions could be reached. +// +// Returns the vector of created instructions. +static std::vector createSplitOperand(Use &Op, + Instruction &Inst) { + auto &OpVal = *Op.get(); + assert(OpVal.getType()->isAggregateType() && "wrong argument"); + // TODO: support ArrayType + auto *InstTy = cast(OpVal.getType()); + auto *InsertionPt = getInsertionPtForSplitOp(Op, Inst); + std::vector SplitOperand; + for (unsigned i = 0; i < InstTy->getNumElements(); ++i) { + assert(!InstTy->getElementType(i)->isAggregateType() && + "folded structures is yet unsupported"); + SplitOperand.push_back( + ExtractValueInst::Create(&OpVal, i, "", InsertionPt)); + } + return SplitOperand; +} + +// Arguments: +// \p Inst - an instruction +// +// Splits all aggregate operands of provided \p Inst. +// Returns a map between original operands and created instructions. +static SplitOpsMap createSplitOperands(Instruction &Inst) { + assert(hasAggregateOperand(Inst) && + "wrong argument: inst must have aggregate operand"); + auto AggregateOps = make_filter_range(Inst.operands(), [](const Use &U) { + return U->getType()->isAggregateType(); + }); + + SplitOpsMap SplitOps; + llvm::transform(AggregateOps, std::inserter(SplitOps, SplitOps.end()), + [&Inst](Use &U) { + return std::make_pair(&U, createSplitOperand(U, Inst)); + }); + return SplitOps; +} + +// Arguments: +// \p elemIdx - element index of the aggregate for which we construct +// split instruction +// \p OrigOps - original instruction operands (contain aggregates) +// \p SplitOps - map between original aggregate operands and corresponding +// split operands +// +// Returns vector of operands (as Value*) for split instruction with index \p +// elemIdx. +template +std::vector createSplitInstOperands(int elemIdx, OpRange OrigOps, + const SplitOpsMap &SplitOps) { + std::vector NewOps; + llvm::transform(OrigOps, std::back_inserter(NewOps), + [elemIdx, &SplitOps](Use &OrigOp) -> Value * { + if (OrigOp.get()->getType()->isAggregateType()) + return SplitOps.at(&OrigOp)[elemIdx]; + return OrigOp.get(); + }); + return NewOps; +} + +// Arguments: +// \p Inst - original instruction +// \p NewOps - operands for split instruction +// +// Creates split instruction based on the kind of original instruction. +// New instruction is inserted right before \p Inst. +// Split instruction is returned. 
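+//
+// For illustration (names and types chosen for the example only): splitting
+// element 0 of
+//   %p = phi %struct_t [ %a, %bb0 ], [ %b, %bb1 ]
+// where %struct_t = type { <16 x float>, <16 x float> } produces
+//   %p.split.aggr = phi <16 x float> [ %a.0, %bb0 ], [ %b.0, %bb1 ]
+// with %a.0 and %b.0 being the extractvalue split operands created earlier.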
+static Instruction *createSplitInst(Instruction &Inst, + const std::vector &NewOps) { + if (isa(Inst)) { + assert(NewOps.size() == 3 && "select must have 3 operands"); + auto *NewSelect = + SelectInst::Create(NewOps[0], NewOps[1], NewOps[2], + Inst.getName() + ".split.aggr", &Inst, &Inst); + NewSelect->setDebugLoc(Inst.getDebugLoc()); + return NewSelect; + } + assert(isa(Inst) && "unsupported instruction"); + assert(Inst.getNumOperands() == NewOps.size() && ""); + auto *NewPHI = PHINode::Create(NewOps[0]->getType(), NewOps.size(), + Inst.getName() + ".split.aggr", &Inst); + + auto &OldPHI = cast(Inst); + for (auto &&Incoming : zip(NewOps, OldPHI.blocks())) { + Value *OpVal = std::get<0>(Incoming); + BasicBlock *OpBB = std::get<1>(Incoming); + assert(isa(OpVal) && + "phi operands must be previously in this pass created " + "extractvalue insts"); + auto *OpInst = cast(OpVal); + NewPHI->addIncoming(OpInst, OpBB); + } + NewPHI->setDebugLoc(Inst.getDebugLoc()); + return NewPHI; +} + +// Arguments: +// \p Inst - original instruction +// \p SplitOps - map between original aggregate operands and corresponding +// elementary operands +// +// Creates all split instructions for original \p Inst, inserts them before the +// original one. Returns vector of created split instructions. +static std::vector +createSplitInsts(Instruction &Inst, const SplitOpsMap &SplitOps) { + // TODO: support ArrayType + auto &InstTy = *cast(Inst.getType()); + int NumNewInsts = InstTy.getNumElements(); + std::vector NewInsts; + NewInsts.reserve(NumNewInsts); + for (int i = 0; i < NumNewInsts; ++i) { + auto NewOps = createSplitInstOperands(i, Inst.operands(), SplitOps); + NewInsts.push_back(createSplitInst(Inst, NewOps)); + } + return NewInsts; +} + +// Arguments: +// \p SplitInsts - split instructions +// \p JoinTy - aggregate type that all split instructions together should +// form \p InsertBefore - insertion point +// +// Combines split instructions back into aggregate value with a sequence of +// inservalue instructions. +// Last insertvalue instruction that form full aggregate value is returned. 
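+//
+// For illustration (names chosen for the example only): joining %s.0 and %s.1
+// back into { <16 x float>, <16 x float> } emits roughly
+//   %j.0 = insertvalue { <16 x float>, <16 x float> } undef, <16 x float> %s.0, 0
+//   %j.1 = insertvalue { <16 x float>, <16 x float> } %j.0, <16 x float> %s.1, 1
+// and %j.1 is returned.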
+static Instruction *joinSplitInsts(const std::vector &SplitInsts, + Type *JoinTy, Instruction *InsertBefore) { + assert(SplitInsts.size() == cast(JoinTy)->getNumElements() && + "number of splitted insts doesn't correspond with aggregate type"); + Value *JoinInst = UndefValue::get(JoinTy); + unsigned Idx = 0; + for (auto *SplitInst : SplitInsts) { + JoinInst = + InsertValueInst::Create(JoinInst, SplitInst, Idx++, "", InsertBefore); + } + return cast(JoinInst); +} + +void GenXAggregatePseudoLowering::processInst(Instruction &Inst) { + assert(hasAggregate(Inst) && + "wrong argument: instruction doesn't work with aggregates"); + SplitOpsMap NewOperands; + if (hasAggregateOperand(Inst)) + NewOperands = createSplitOperands(Inst); + auto NewInsts = createSplitInsts(Inst, NewOperands); + auto *JoinInst = + joinSplitInsts(NewInsts, Inst.getType(), getFirstInsertionPtAfter(Inst)); + Inst.replaceAllUsesWith(JoinInst); + ToErase.push_back(&Inst); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp new file mode 100644 index 000000000000..2ca18722a8e7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp @@ -0,0 +1,401 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// AlignmentInfo is a cache of information on the alignment of instruction +// values in a function. Alignment is stored as LogAlign and ExtraBits +// (ExtraBits < 1 << LogAlign) where a value is known to be +// A << LogAlign | ExtraBits. +// +// For a vector value, the alignment information is for element 0. +// +// The alignment of a value is computed as it is required, rather than all +// values in a function being computed in a separate analysis pass. 
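+//
+// For example, a value known to be of the form 16*A + 5 for some unknown A is
+// represented with LogAlign = 4 and ExtraBits = 5, while a value about which
+// nothing is known has LogAlign = 0 and ExtraBits = 0.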
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ALIGNMENT_INFO" + +#include +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * AlignmentInfo::get : get the aligmment of a Value + * + * Return: the Alignment + */ +Alignment AlignmentInfo::get(Value *V) +{ + if (auto C = dyn_cast(V)) + return Alignment(C); + auto Inst = dyn_cast(V); + if (!Inst) { + // An Argument has unknown alignment. + // (FIXME: We may need to do better than this, tracing the value of the + // Argument at call sites, when arg indirection is introduced.) + return Alignment::getUnknown(); + } + auto MapEntry = &InstMap[Inst]; + if (!MapEntry->isUncomputed()) + return *MapEntry; // already in cache + // Need to compute for this instruction. + LLVM_DEBUG(dbgs() << "AlignmentInfo::get: computing alignment for " << Inst->getName() << "\n"); + // Get the web of instructions related to this one, including going through + // phi nodes, excluding ones that we already have alignment for. + std::set InstWebSet; + SmallVector InstWeb; + InstWebSet.insert(Inst); + InstWeb.push_back(Inst); + for (unsigned i = 0; i != InstWeb.size(); ++i) { + auto WorkInst = InstWeb[i]; + if (auto Phi = dyn_cast(WorkInst)) { + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) + if (auto IncomingInst = dyn_cast(Phi->getIncomingValue(ii))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else if (isa(WorkInst) || isa(WorkInst)) { + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe; ++oi) + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(oi))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else if (CastInst *CI = dyn_cast(WorkInst)) { + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(0))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else + switch (GenXIntrinsic::getGenXIntrinsicID(WorkInst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_convert_addr: + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(0))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + break; + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_add_addr: + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe; ++oi) + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(oi))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + break; + default: + break; + } + } + LLVM_DEBUG(dbgs() << "web:"; + for (unsigned i = 0, e = InstWeb.size(); i != e; ++i) + dbgs() << " " << InstWeb[i]->getName(); + dbgs() << "\n"); + // Use a worklist algorithm where each instruction in the web is initially on + // the worklist. 
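+  // Each iteration pops an instruction, recomputes its alignment from the
+  // current alignments of its operands, and, if the result changed, pushes the
+  // instruction's users within the web back onto the worklist, so the
+  // computation runs to a fixed point.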
+ std::set WorkSet; + for (auto i = InstWeb.begin(), e = InstWeb.end(); i != e; ++i) + WorkSet.insert(*i); + while (!InstWeb.empty()) { + Instruction *WorkInst = InstWeb.back(); + InstWeb.pop_back(); + WorkSet.erase(WorkInst); + LLVM_DEBUG(dbgs() << " processing " << WorkInst->getName() << "\n"); + + Alignment A(0, 0); // assume unknown + if (BinaryOperator *BO = dyn_cast(WorkInst)) { + A = Alignment(); // assume uncomputed + Alignment A0 = getFromInstMap(BO->getOperand(0)); + Alignment A1 = getFromInstMap(BO->getOperand(1)); + if (!A0.isUncomputed() && !A1.isUncomputed()) { + switch (BO->getOpcode()) { + case Instruction::Add: + A = A0.add(A1); + break; + case Instruction::Sub: + if (A1.isConstant()) + A = A0.add(-(A1.getConstBits())); + else + A = Alignment::getUnknown(); + break; + case Instruction::Mul: + A = A0.mul(A1); + break; + case Instruction::Shl: + if (A1.isConstant()) { + A1 = Alignment(A1.getConstBits(), 0); + A = A0.mul(A1); + } else + A = Alignment::getUnknown(); + break; + default: + A = Alignment::getUnknown(); + break; + } + } + } else if (CastInst *CI = dyn_cast(WorkInst)) { + // Handle a bitcast for the same reason as above. This also handles + // trunc, sext, zext. + A = getFromInstMap(CI->getOperand(0)); + if (!A.isUncomputed()) { + unsigned LogAlign = A.getLogAlign(), ExtraBits = A.getExtraBits(); + LogAlign = std::min( + LogAlign, + static_cast( + CI->getType()->getScalarType()->getPrimitiveSizeInBits())); + if (LogAlign < 32) + ExtraBits &= (1 << LogAlign) - 1; + A = Alignment(LogAlign, ExtraBits); + } else if (!CI->isIntegerCast()) { + // For no-only-integer cast instructions - FPToUI, FPToSI + A = Alignment::getUnknown(); + } + } else if (auto Phi = dyn_cast(WorkInst)) { + // For a phi node, ignore uncomputed incomings so we have an initial + // guess at alignment value to propagate round a loop and refine in + // a later visit to this same phi node. + A = Alignment(); // initialize to uncomputed + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) { + LLVM_DEBUG(dbgs() << " incoming: " << *Phi->getIncomingValue(ii) << "\n"); + LLVM_DEBUG(dbgs() << " merging " << A << " and " << getFromInstMap(Phi->getIncomingValue(ii)) << "\n"); + A = A.merge(getFromInstMap(Phi->getIncomingValue(ii))); + LLVM_DEBUG(dbgs() << " giving " << A << "\n"); + } + } else { + switch (GenXIntrinsic::getGenXIntrinsicID(WorkInst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + // Handle the case of reading a scalar from element 0 of a vector, as + // a trunc from i32 to i16 is lowered to a bitcast to v2i16 then a + // rdregion. 
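+      // For example (sketch):
+      //   %v  = bitcast i32 %x to <2 x i16>
+      //   %lo = rdregion(%v)   ; reads element 0
+      // so %lo can take the alignment already computed for %x.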
+ Region R(WorkInst, BaleInfo()); + if (!R.Indirect && !R.Offset) + A = getFromInstMap(WorkInst->getOperand(0)); + else + A = Alignment(0, 0); + break; + } + case GenXIntrinsic::genx_constanti: + A = Alignment(cast(WorkInst->getOperand(0))); + break; + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_convert_addr: + A = getFromInstMap(WorkInst->getOperand(0)); + break; + case GenXIntrinsic::genx_add_addr: { + Alignment AA[2]; + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe && oi < 2; ++oi) + AA[oi] = getFromInstMap(WorkInst->getOperand(oi)); + if (!AA[0].isUncomputed() && !AA[1].isUncomputed()) + A = AA[0].add(AA[1]); + else + A = Alignment(0, 0); + break; + } + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_uumad: { + A = Alignment(); // assume uncomputed + // every source operand should be computed or constant + Alignment SA[3]; + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe && oi < 3; ++oi) + SA[oi] = getFromInstMap(WorkInst->getOperand(oi)); + if (!SA[0].isUncomputed() && !SA[1].isUncomputed() && !SA[2].isUncomputed()) + A = SA[0].mul(SA[1]).add(SA[2]); + else + A = Alignment(0, 0); + break; + } + default: + A = Alignment(0, 0); // no alignment info + break; + } + } + // See if the alignment has changed for WorkInst. + auto MapEntry = &InstMap[WorkInst]; + if (*MapEntry == A) + continue; // no change + *MapEntry = A; + LLVM_DEBUG(dbgs() << " " << WorkInst->getName() << " updated to " << A << "\n"); + // Add all users that are in the original web to the worklist, if + // not already in the worklist. + for (auto ui = WorkInst->use_begin(), ue = WorkInst->use_end(); + ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (InstWebSet.find(user) != InstWebSet.end() + && WorkSet.insert(user).second) + InstWeb.push_back(user); + } + } + MapEntry = &InstMap[Inst]; + assert(!MapEntry->isUncomputed()); + LLVM_DEBUG(dbgs() << "AlignmentInfo::get: returning " << *MapEntry << "\n"); + return *MapEntry; +} + +/*********************************************************************** + * Alignment constructor given literal value + */ +Alignment::Alignment(unsigned C) +{ + LogAlign = countTrailingZeros(C); + ExtraBits = 0; + ConstBits = (C < 0x7fffffff)? C : 0x7fffffff; +} + +/*********************************************************************** + * Alignment constructor given Constant + */ +Alignment::Alignment(Constant *C) +{ + setUncomputed(); + if (isa(C->getType())) + C = C->getAggregateElement(0U); + if (isa(C)) { + LogAlign = 31; + ExtraBits = 0; + ConstBits = 0x7fffffff; + } else if (auto CI = dyn_cast(C)) { + LogAlign = countTrailingZeros((unsigned)(CI->getSExtValue())); + ExtraBits = 0; + ConstBits = 0x7fffffff; + if (CI->getSExtValue() < 0x7fffffff && CI->getSExtValue() >= 0) + ConstBits = (unsigned)(CI->getSExtValue()); + } +} + +/*********************************************************************** + * merge : merge two alignments + */ +Alignment Alignment::merge(Alignment Other) const +{ + // If either is uncomputed, result is the other one. + if (isUncomputed()) + return Other; + if (Other.isUncomputed()) + return *this; + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. 
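+  // For example (values chosen for illustration): merging 8A+2 (LogAlign = 3,
+  // ExtraBits = 2) with 8B+6 (LogAlign = 3, ExtraBits = 6) disagrees in bit 2,
+  // so the result is LogAlign = 2, ExtraBits = 2, i.e. 4C+2.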
+ unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + if (MinLogAlign) { + unsigned DisagreeExtraBits = (ExtraBits ^ Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(DisagreeExtraBits, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** + * merge : add two alignments + */ +Alignment Alignment::add(Alignment Other) const +{ + assert(!isUncomputed() && !Other.isUncomputed()); + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. + unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + unsigned ExtraBits2 = 0; + if (MinLogAlign) { + ExtraBits2 = (ExtraBits + Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(ExtraBits2, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits2 & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** +* merge : mul two alignments +*/ +Alignment Alignment::mul(Alignment Other) const +{ + assert(!isUncomputed() && !Other.isUncomputed()); + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. + unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + if (ExtraBits == 0 && Other.ExtraBits == 0) + MinLogAlign = LogAlign + Other.LogAlign; + else if (ExtraBits == 0) + MinLogAlign = LogAlign; + else if (Other.ExtraBits == 0) + MinLogAlign = Other.LogAlign; + unsigned ExtraBits2 = 0; + if (MinLogAlign) { + ExtraBits2 = (ExtraBits * Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(ExtraBits2, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits2 & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** + * getFromInstMap : get the alignment of a value, direct from InstMap if + * found else return Unknown, Alignment(0, 0) + */ +Alignment AlignmentInfo::getFromInstMap(Value *V) +{ + if (auto C = dyn_cast(V)) + return Alignment(C); + if (auto Inst = dyn_cast(V)) { + return InstMap[V]; + } + return Alignment::getUnknown(); +} + +/*********************************************************************** + * Alignment debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void Alignment::dump() const +{ + errs() << *this << "\n"; +} +#endif + +void Alignment::print(raw_ostream &OS) const +{ + if (isUncomputed()) + OS << "uncomputed"; + else if (isUnknown()) + OS << "unknown"; + else if (isConstant()) + OS << "const=" << ConstBits; + else + OS << "n<<" << LogAlign << "+" << ExtraBits; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h new file mode 100644 index 000000000000..3b714126bdef --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h @@ -0,0 +1,154 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the 
Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// genx::AlignmentInfo : alignment information +/// ------------------------------------------- +/// +/// AlignmentInfo is a cache of information on the alignment of instruction +/// values in a function. It does not persist between passes. +/// +/// A pass that needs alignment information constructs an AlignmentInfo at +/// the start of the pass, and then calls the ``get`` method each time it wants +/// alignment information for a particular instruction value. AlignmentInfo +/// calculates it if it is not already in its cache, which probably involves +/// also calculating the alignment of other instructions that the given one +/// depends on. +/// +/// This cacheing and lazy calculation is done instead of having a separate analysis +/// pass because alignment is needed for only a small subset of values in a function. +/// +/// The alignment is returned as an *Alignment* object with three fields: +/// *ConstBits*, if ConstBits is not 0x7fffffff, alignment is a known bit-pattern, +/// otherwise *LogAlign* and *ExtraBits* (where 0 <= ExtraBits < (1 << LogAlign)), +/// stating that the value is known to be A << LogAlign | ExtraBits for some A. +/// +/// For a vector value, the alignment information is for element 0. +/// +/// The calculation uses a worklist algorithm that can cope with phi nodes and +/// loops. So, for example, a variable (used as an indirect region index) that +/// starts at 10 then is incremented by 8 inside a loop is correctly calculated +/// to be 8A+2 for some A. +/// +//===----------------------------------------------------------------------===// + +#ifndef GENXALIGNMENTINFO_H +#define GENXALIGNMENTINFO_H + +#include "GenX.h" +#include "IgnoreRAUWValueMap.h" + +namespace llvm { + class raw_ostream; + +namespace genx { + +// Alignment : the alignment of a value +class Alignment { + unsigned LogAlign; + unsigned ExtraBits; + unsigned ConstBits; +public: + // No-arg constructor sets to uncomputed state. + Alignment() { setUncomputed(); } + // Constructor given LogAlign and ExtraBits fields. + Alignment(unsigned LogAlign, unsigned ExtraBits) + : LogAlign(LogAlign), ExtraBits(ExtraBits), ConstBits(0x7fffffff) {} + // Constructor given literal value. + Alignment(unsigned C); + // Constructor given Constant. 
+ Alignment(Constant *C); + // Copy-constructor + Alignment(const Alignment& Rhs) { + LogAlign = Rhs.LogAlign; + ExtraBits = Rhs.ExtraBits; + ConstBits = Rhs.ConstBits; + } + // Copy-operator + Alignment& operator=(const Alignment &Rhs) { + LogAlign = Rhs.LogAlign; + ExtraBits = Rhs.ExtraBits; + ConstBits = Rhs.ConstBits; + return *this; + } + + // Get an unknown alignment + static Alignment getUnknown() { return Alignment(0, 0); } + // Merge two Alignments + Alignment merge(Alignment Other) const; + // Add one Alignment with another Alignment + Alignment add(Alignment Other) const; + // Mul one Alignment with another Alignment + Alignment mul(Alignment Other) const; + + // accessors + bool isUncomputed() const { return LogAlign == 0xffffffff; } + bool isUnknown() const { return LogAlign == 0 && ConstBits == 0x7fffffff; } + bool isConstant() const { return !isUncomputed() && ConstBits != 0x7fffffff; } + unsigned getLogAlign() const { assert(!isUncomputed()); return LogAlign; } + unsigned getExtraBits() const { assert(!isUncomputed()); return ExtraBits; } + int64_t getConstBits() const { assert(isConstant()); return ConstBits; } + // comparison + bool operator==(const Alignment &Rhs) const { + return (LogAlign == Rhs.LogAlign && + ExtraBits == Rhs.ExtraBits && + ConstBits == Rhs.ConstBits); + } + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; +private: + void setUncomputed() { + LogAlign = 0xffffffff; + ExtraBits = 0; + ConstBits = 0x7fffffff; + } +}; + +// AlignmentInfo : cache of alignment of instructions in a function +class AlignmentInfo { + ValueMap> InstMap; +public: + // AlignmentInfo constructor + AlignmentInfo() {} + // Clear the cache of value alignments + void clear() { InstMap.clear(); } + // get the alignment of a Value + Alignment get(Value *V); +public: + // return an Alignment for a value + Alignment getFromInstMap(Value *V); +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const Alignment &A) { + A.print(OS); + return OS; +} + +} // end namespace genx +} // end namespace llvm + +#endif /* GENXALIGNMENTINFO_H */ diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp new file mode 100644 index 000000000000..a221ea459256 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp @@ -0,0 +1,144 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// GenXAnalysisDumper is a pass that calls the print() method on a function +// pass to dump its state out to a file. +// GenXGroupAnalysisDumper is the same, but for a function group pass. +// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "GenX.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXAnalysisDumper : a pass to dump an analysis to a file +class GenXAnalysisDumper : public FunctionPass { + FunctionPass *P; + const char *Suffix; +public: + static char ID; + explicit GenXAnalysisDumper(FunctionPass *P, const char *Suffix) + : FunctionPass(ID), P(P), Suffix(Suffix) { } + virtual StringRef getPassName() const { return "GenX analysis dumper pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunction(Function &F); +}; + +// GenXGroupAnalysisDumper : a pass to dump an analysis to a file +class GenXGroupAnalysisDumper : public FunctionGroupPass { + FunctionGroupPass *P; + const char *Suffix; +public: + static char ID; + explicit GenXGroupAnalysisDumper(FunctionGroupPass *P, const char *Suffix) + : FunctionGroupPass(ID), P(P), Suffix(Suffix) { } + virtual StringRef getPassName() const { return "GenX analysis dumper pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunctionGroup(FunctionGroup &FG); +}; + +} // end anonymous namespace + +char GenXAnalysisDumper::ID = 0; + +FunctionPass *llvm::createGenXAnalysisDumperPass( + FunctionPass *P, const char *Suffix) +{ + return new GenXAnalysisDumper(P, Suffix); +} + +char GenXGroupAnalysisDumper::ID = 0; + +FunctionGroupPass *llvm::createGenXGroupAnalysisDumperPass( + FunctionGroupPass *P, const char *Suffix) +{ + return new GenXGroupAnalysisDumper(P, Suffix); +} + +/*********************************************************************** + * openFileForDump : open file for dumping analysis into + * + * The filename is the name of the kernel, or the name of the function if + * not a kernel, with the supplied suffix. + * + * On error, this function prints an error message and returns -1. + */ +static int openFileForDump(Function *F, StringRef Suffix) +{ + // Get name of kernel, or failing that, name of function. + KernelMetadata KM(F); + StringRef Name = KM.getName(); + if (Name.empty()) + Name = F->getName(); + int FD = -1; + std::string Filename = (Name + Suffix).str(); + // Sanitize templated kernel names. 
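+  // For example, a kernel named "foo<16>" would otherwise yield a file name
+  // containing '<' and '>'; those characters are replaced with '_', giving
+  // "foo_16_" followed by the suffix.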
+ std::replace_if(Filename.begin(), Filename.end(), + [](const char x) { return x == '<' || x == '>'; }, '_'); + auto EC = sys::fs::openFileForWrite(Filename, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return -1; + } + return FD; +} + +/*********************************************************************** + * GenXAnalysisDumper::runOnFunction : dump analysis to file + */ +bool GenXAnalysisDumper::runOnFunction(Function &F) +{ + int FD = openFileForDump(&F, Suffix); + raw_fd_ostream O(FD, /*shouldClose=*/ true); + P->print(O, F.getParent()); + return false; +} + +/*********************************************************************** + * GenXGroupAnalysisDumper::runOnFunctionGroup : dump analysis to file + */ +bool GenXGroupAnalysisDumper::runOnFunctionGroup(FunctionGroup &FG) +{ + int FD = openFileForDump(FG.getHead(), Suffix); + raw_fd_ostream O(FD, /*shouldClose=*/ true); + P->print(O, FG.getHead()->getParent()); + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp new file mode 100644 index 000000000000..0509ca976013 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp @@ -0,0 +1,1822 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXArgIndirection +/// ------------------ +/// +/// The GenXArgIndirection pass runs very late, after coalescing and address +/// commoning, to change arguments and return values that were originally by ref +/// to use address registers. This saves copies and register pressure. +/// +/// Recall that, very early on in CMABI, a by ref argument is transformed into +/// copy-in copy-out semantics. +/// +/// This pass is run very late on for two reasons: +/// +/// 1. There is no convenient way to represent passing an argument using an +/// address register in LLVM IR. We don't want to pretend that the address +/// register is a pointer, and the GRF is an area of memory, as that would +/// stop us using Values to represent registers normally, and so would stop +/// us using lots of LLVM optimizations. 
+/// +/// Running the pass this late means that the IR afterwards does not have to +/// strictly represent the semantics, as nothing else happens to it before +/// generating the output code. So uses and defs of the indirected argument +/// (and other Values coalesced with it) still use the same Values, but that +/// live range has no register allocated (it is category NONE), and all +/// accesses are indirected. We rely on the LLVM IR together with the +/// liveness information representing the code well enough for register +/// allocation and code generation to work. +/// +/// 2. We cannot tell whether we want to perform this transformation until we can +/// see how Values have coalesced. +/// +/// Action of GenXArgIndirection +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An argument for a subroutine call can generate a bunch of mov instructions in +/// two circumstances: +/// +/// 1. Coalescing failed to coalesce this call argument, so the argument in the +/// caller and the argument in the subroutine are in different registers +/// (different coalesced live ranges). In this case, GenXCoalescing has to +/// generate a sequence of baled together rdregion-wrregion intrinsic pairs, +/// each generating a mov instruction, to copy the value. +/// +/// 2. The argument was originally a by ref CM select(), so is an rdregion, +/// legalized into a sequence of baled together rdregion-wrregion pairs. +/// +/// The argument indirection pass attempts to spot these cases. The regions at +/// each call site must be similar (same region parameters except start index) +/// and contiguous. +/// +/// The pass modifies each call to pass an address register into the subroutine +/// as an extra argument, using it to indirecting all accesses to the subroutine +/// argument and other Values coalesced with it. It then removes the rd-wr +/// sequence so it does not generate any code. +/// +/// Indirecting all accesses to the subroutine argument is only possible if each +/// one would be legal as an indirect region. The pass uses the +/// hasIndirectGRFCrossing feature from GenXSubtarget to tell whether it would +/// be legal. The optimization can fail for this reason, and that is more common +/// on pre-SKL where there is no indirect region GRF crossing. +/// +/// The pass deals with one subroutine argument in one subroutine at a time. It +/// looks at all call sites to see if there is anything that stops this +/// transformation happening at all, and whether there is any call site that +/// would benefit from the transformation. +/// +/// Coalesced return value +/// """""""""""""""""""""" +/// +/// If the subroutine argument is coalesced with a return value from the call, +/// then argument indirection can succeed only if the return value at each call +/// site is written (similarly using a rd-wr sequence) to exactly the same +/// region in a vector that is coalesced (so same register) with the input +/// vector to the rd-wr sequence for the argument. +/// +/// No coalesced return value +/// """"""""""""""""""""""""" +/// +/// If the subroutine argument is _not_ coalesced with a return value from the +/// call, so only the arg could be indirected, indirection can only occur if one +/// of these conditions is met: +/// +/// 1. the live range being indirected is not live over the call (so it does not +/// matter if the subroutine writes to the same register), or +/// +/// 2. the subroutine does not write to the same register (i.e. 
there are no defs +/// in the subroutine arg's live range other than args and coalesced +/// bitcasts). +/// +/// Constant argument and rd-wr sequence return value +/// """"""""""""""""""""""""""""""""""""""""""""""""" +/// +/// Where the original source initializes a big vector or matrix to constant and +/// then calls a subroutine passing the vector by ref, the IR that this pass sees +/// is that the argument passed to the call is constant, and the rd-wr sequence +/// for the return value has an "old value" input that is another constant +/// (including undef). +/// +/// GenXArgIndirection spots this case, and transforms the code to load the +/// combination of the two constants before the call and pass an address register +/// set to the appropriate point. +/// +/// Indirection of subroutine +/// """"""""""""""""""""""""" +/// +/// If an argument is being indirected, all references to that register +/// (coalesced live range) inside the subroutine and everything it calls must be +/// indirected. +/// +/// GenXArgIndirection does not include the facility to split up a bale if it +/// would become illegal when indirected. This is only a problem in BDW and +/// earlier, where an indirect region is not allowed to cross even one GRF +/// boundary. If it sees an access with a region that would become illegal if +/// indirected, it abandons indirection of that argument. +/// +/// Warning messages +/// ^^^^^^^^^^^^^^^^ +/// +/// Where GenXArgIndirection sees a suitably large uncoalesced call arg that +/// would benefit from arg indirection, but it fails to satisfy the criteria, +/// the pass outputs a warning message. The idea is that the CM programmer +/// might consider some changes to his/her kernel to optimize it. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ARGINDIRECTION" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXConstants.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt LimitGenXArgIndirection("limit-genx-arg-indirection", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX argument indirection.")); + + +namespace { + +class GenXArgIndirection; +class SubroutineArg; + + +// Diagnostic information for error/warning relating arg indirection. +class DiagnosticInfoArgIndirection : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. 
+ DiagnosticInfoArgIndirection(Instruction *Inst, Argument *Arg, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoArgIndirection::KindID = 0; + +// processArgLR relies on these being in this order. +// checkIndirectability relies on these being powers of 2 (except +// CALLER_INDIRECTING being 0) +enum Indirectability { + CALLER_INDIRECTING = 0, + NO_OPTIMIZATION = 1, + WANT_INDIRECTION = 2, + WANT_SOME_INDIRECTION = 4, + CANNOT_INDIRECT = 8 +}; + +// A call site and the action that we want to take when indirecting the arg. +// This is then subclassed by the *CallSite classes below. +class CallSite { +public: + CallInst *CI; +protected: + Indirectability State; + Value *Index; +public: + CallSite(CallInst *CI, Indirectability State, Value *Index) + : CI(CI), State(State), Index(Index) {} + virtual ~CallSite() {} + Indirectability getState() const { return State; } + Value *getIndex() const { return Index; } + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg) = 0; + virtual void printImpl(raw_ostream &OS) const = 0; + void print(raw_ostream &OS) const { printImpl(OS); } +}; + +raw_ostream &operator<<(raw_ostream &OS, const CallSite &CS) { + CS.print(OS); return OS; +} + +// A call site in a subroutine that is itself indirecting the arg. +class CallerIndirectingCallSite : public CallSite { + SubroutineArg *CallerSubrArg; +public: + CallerIndirectingCallSite(CallInst *CI, SubroutineArg *CallerSubrArg) + : CallSite(CI, Indirectability::CALLER_INDIRECTING, nullptr), + CallerSubrArg(CallerSubrArg) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "CallerIndirectingCallSite " << *CI; + } +}; + +// A call site where indirecting the arg does not give any optimization because +// we did not find copies or rd/wr regions that we can get rid of. We can still +// indirect it though if other call sites do get an optimization. +class NoOptCallSite : public CallSite { +public: + NoOptCallSite(CallInst *CI) + : CallSite(CI, Indirectability::NO_OPTIMIZATION, nullptr) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "NoOptCallSite " << *CI; + } +}; + +// A call site where the arg is constant (including undef) and the arg is +// coalesced with a retval that is used only in a legalized wrregion +// whose "old value" input is constant. +class ConstArgRetCallSite : public CallSite { + Constant *LdConst; // the constant that needs to be loaded + AssertingVH RetEndWr; // the last wrregion in the sequence for the retval +public: + ConstArgRetCallSite(CallInst *CI, Constant *LdConst, Instruction *RetEndWr, + Value *Index) + : CallSite(CI, Indirectability::WANT_INDIRECTION, Index), + LdConst(LdConst), RetEndWr(RetEndWr) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "ConstArgRetCallSite " << *CI << "\n LdConst " << *LdConst + << " \n RetEndWr " << *RetEndWr << "\n Index " << *Index; + } +}; + +// A call site where the arg is a legalized rdregion or copy, and there is no +// retval coalesced with it. +class IndirectArgCallSite : public CallSite { +protected: + // Some use of input (arg or inst) in legalized rdregion or copy. 
This is + // kept as a Use * rather than the value it actually uses to allow for the + // case that the value is something that will be replaced and erased by + // another call site processing the same ArgLR. + Use *InputUse; +public: + IndirectArgCallSite(CallInst *CI, Use *InputUse, Value *Index) + : CallSite(CI, Indirectability::WANT_INDIRECTION, Index), + InputUse(InputUse) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "IndirectArgCallSite " << *CI << "\n Input " << (*InputUse)->getName() + << " Index " << *Index; + } +}; + +// A call site where the arg is a legalized rdregion or copy, and the arg is +// coalesced with a retval that is used only in a legalized wrregion or copy. +class IndirectArgRetCallSite : public IndirectArgCallSite { + AssertingVH RetEndWr; // the last wrregion in the sequence for the retval +public: + IndirectArgRetCallSite(CallInst *CI, Use *InputUse, Instruction *RetEndWr, + Value *Index) : IndirectArgCallSite(CI, InputUse, Index), RetEndWr(RetEndWr) + {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "IndirectArgRetCallSite " << *CI << "\n Input " << (*InputUse)->getName() + << " RetEndWr " << RetEndWr->getName() << " Index " << *Index; + } +}; + + +class GenXArgIndirection; + +// A subroutine arg that we might want to indirect +class SubroutineArg { + GenXArgIndirection *Pass; +public: + LiveRange *ArgLR; + Argument *Arg; +private: + int CoalescedRetIdx; + bool CanCoalesceWithoutKill; + SmallVector CallSites; + Alignment Align; + Function *F; + Function *NewFunc; +public: + Argument *AddressArgument; + SubroutineArg(GenXArgIndirection *Pass, LiveRange *ArgLR, Argument *Arg) + : Pass(Pass), ArgLR(ArgLR), Arg(Arg), F(Arg->getParent()), NewFunc(nullptr) {} + ~SubroutineArg() { + for (auto i = CallSites.begin(), e = CallSites.end(); i != e; ++i) + delete *i; + } + Indirectability checkIndirectability(); + CallSite *createCallSite(CallInst *CI); + Alignment getIndirectAlignment() const; + void gatherBalesToModify(Alignment Align); + void addAddressArg(); + void fixCallSites(); + void coalesceAddressArgs(); + void replaceFunction(); +private: + static Value *getRetVal(CallInst *CI, unsigned RetNum); +}; + +// GenX arg indirection pass +class GenXArgIndirection : public FunctionGroupPass { + friend CallSite; + friend SubroutineArg; + friend NoOptCallSite; + friend ConstArgRetCallSite; + friend IndirectArgCallSite; + friend IndirectArgRetCallSite; +private: + FunctionGroup *FG; + FunctionGroupAnalysis *FGA; + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + AlignmentInfo *AI; + const GenXSubtarget *ST; + // List of arg live ranges to consider. + SmallVector ArgLRs; + // For the ArgLR being processed: + // List of subroutine args in the ArgLR. + SmallVector SubrArgs; + // Bales that need modifying for indirection. + SmallVector BalesToModify; + // Map from function back to the SubroutineArg for it. + std::map FuncMap; + // List of LRs that we need to recalculate. 
+ SmallVector LRsToCalculate; +public: + static char ID; + explicit GenXArgIndirection() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX arg indirection"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + void gatherArgLRs(); + bool processArgLR(LiveRange *ArgLR); + bool gatherBalesToModify(LiveRange *ArgLR, Alignment Align); + bool checkIndirectBale(Bale *B, LiveRange *ArgLR, Alignment Align); + void indirectBale(Bale *B, LiveRange *ArgLR, Argument *AddressArg); + void indirectRegion(Use *U, Value *AddressArg, Instruction *InsertBefore); + static Argument *getArgForFunction(LiveRange *LR, Function *F); + void replaceAndEraseSequence(Instruction *RetEndWr, Value *V); +}; + +} // end anonymous namespace + +char GenXArgIndirection::ID = 0; +namespace llvm { void initializeGenXArgIndirectionPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXArgIndirection, "GenXArgIndirection", "GenXArgIndirection", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXArgIndirection, "GenXArgIndirection", "GenXArgIndirection", false, false) + +FunctionGroupPass *llvm::createGenXArgIndirectionPass() +{ + initializeGenXArgIndirectionPass(*PassRegistry::getPassRegistry()); + return new GenXArgIndirection(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the coalescing pass for this FunctionGroup + */ +bool GenXArgIndirection::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + FG = &ArgFG; + unsigned Modified = 0; + // Get analyses that we use and/or modify. + FGA = &getAnalysis(); + Baling = &getAnalysis(); + Numbering = &getAnalysis(); + Liveness = &getAnalysis(); + AI = new AlignmentInfo; + ST = getAnalysis().getSubtarget(); + // Gather list of LRs containing an arg that we want to consider. (Two + // args might be coalesced together, so we consider a whole arg-containing + // LR at a time.) + gatherArgLRs(); + // Process them. + for (auto i = ArgLRs.begin(), e = ArgLRs.end(); + i != e && Modified < LimitGenXArgIndirection; ++i) { + if (processArgLR(*i)) { + ++Modified; + if (LimitGenXArgIndirection != UINT_MAX) + dbgs() << "genx-arg-indirection " << Modified << "\n"; + } + } + ArgLRs.clear(); + SubrArgs.clear(); + BalesToModify.clear(); + FuncMap.clear(); + LRsToCalculate.clear(); + delete AI; + return Modified != 0; +} + +/*********************************************************************** + * gatherArgLRs : gather a list of LRs containing an arg that we want to + * consider + */ +void GenXArgIndirection::gatherArgLRs() +{ + std::set Seen; + // For a kernel arg, add it to Seen but not to the list, so it will not get + // added to the list. We cannot indirect a kernel arg. 
+ for (auto ai = FG->at(0)->arg_begin(), ae = FG->at(0)->arg_end(); + ai != ae; ++ai) + Seen.insert(Liveness->getLiveRange(&*ai)); + // For a subroutine arg, add its LR to the list if it is not already in Seen. + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + if ((*fgi)->hasFnAttribute("referenced-indirectly")) + continue; + for (auto ai = (*fgi)->arg_begin(), ae = (*fgi)->arg_end(); ai != ae; ++ai) { + Argument *Arg = &*ai; + // Only process an arg that is bigger than 2 GRFs. + if (Arg->getType()->getPrimitiveSizeInBits() <= ST->getGRFWidth() * 16) + continue; + LiveRange *LR = Liveness->getLiveRange(Arg); + if (Seen.insert(LR).second) + ArgLRs.push_back(LR); + } + } +} + +/*********************************************************************** + * processArgLR : process one live range containing at least one subroutine arg + * + * Return: true = some modifications made + */ +bool GenXArgIndirection::processArgLR(LiveRange *ArgLR) +{ + // Get a list of args in this live range. + SubrArgs.clear(); + FuncMap.clear(); + LLVM_DEBUG(dbgs() << "processArgLR: " << *ArgLR << "\n"); + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) + if (auto Arg = dyn_cast(vi->getValue())) { + SubrArgs.push_back(SubroutineArg(this, ArgLR, Arg)); + FuncMap[Arg->getParent()] = &SubrArgs.back(); + } + // For each arg, see if we can or want to indirect. + Indirectability Res = Indirectability::NO_OPTIMIZATION; + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) { + LLVM_DEBUG(dbgs() << " checkIndirectability on arg " << SubrArg->Arg->getArgNo() + << " (" << (SubrArg->Arg->getType()->getPrimitiveSizeInBits() / 8U) + << " bytes) in " << SubrArg->Arg->getParent()->getName() << "\n"); + Res = std::max(Res, SubrArg->checkIndirectability()); + } + if (Res == Indirectability::NO_OPTIMIZATION) { + LLVM_DEBUG(dbgs() << "NO_OPTIMIZATION\n"); + return false; // no indirection needed + } + if (Res == Indirectability::CANNOT_INDIRECT) { + LLVM_DEBUG(dbgs() << "CANNOT_INDIRECT\n"); + return false; // cannot indirect this ArgLR + } + // Get the worst case alignment of the indices from the call sites if we + // indirect this arg. + Alignment Align = Alignment(5, 0); + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) { + auto ThisAlign = SubrArg->getIndirectAlignment(); + Align = Align.merge(ThisAlign); + } + // Gather the bales that need indirecting, and check whether indirection is + // possible. + if (!gatherBalesToModify(ArgLR, Align)) + return false; + LLVM_DEBUG(dbgs() << "GenXArgIndirection is going to indirect " << *ArgLR << "\n"); + LRsToCalculate.clear(); + if (Res == Indirectability::WANT_SOME_INDIRECTION) { + // The arg that we're indirecting is coalesced at some call site where we + // are going to indirect it (represented by a NoOptCallSite). To avoid the + // coalesced LR also being live at other call sites where the arg is in + // fact in some other register, we need to uncoalesce. We take the values + // in ArgLR and separate into two piles: one defined outside subroutines + // where ArgLR has an arg, and one defined inside such subroutines. Then + // the two piles get a live range each, and the latter one is marked as not + // needing a register allocating. 
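+  // Illustrative sketch (hypothetical value names, not from the pass): if
+  // ArgLR currently holds { %caller.vec, %sub.arg, %sub.tmp }, where
+  // %caller.vec is defined in a caller that keeps the arg in a register (a
+  // NoOptCallSite) and the other two live inside an indirected subroutine,
+  // then %caller.vec goes to OutsidePile (a new live range that keeps its
+  // register), while %sub.arg and %sub.tmp stay in ArgLR, which is later
+  // given category NONE.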
+ SmallVector OutsidePile; + SmallVector InsidePile; + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); + vi != ve; ++vi) { + auto SV = *vi; + Function *ContainingFunc = Liveness->isUnifiedRet(SV.getValue()); + if (!ContainingFunc) { + if (auto VArg = dyn_cast(SV.getValue())) + ContainingFunc = VArg->getParent(); + else + ContainingFunc = cast(SV.getValue()) + ->getParent()->getParent(); + } + if (!FuncMap[ContainingFunc]) + OutsidePile.push_back(SV); + else + InsidePile.push_back(SV); + } + assert(!InsidePile.empty()); + if (!OutsidePile.empty()) { + Liveness->removeValuesNoDelete(ArgLR); + LiveRange *OutsideLR = Liveness->getOrCreateLiveRange(OutsidePile[0]); + OutsideLR->setCategory(ArgLR->getCategory()); + for (auto vi = OutsidePile.begin() + 1, ve = OutsidePile.end(); + vi != ve; ++vi) + Liveness->setLiveRange(*vi, OutsideLR); + for (auto vi = InsidePile.begin(), ve = InsidePile.end(); + vi != ve; ++vi) + Liveness->setLiveRange(*vi, ArgLR); + LLVM_DEBUG(dbgs() << " Uncoalesced ArgLR into " << *OutsideLR + << "\n and " << *ArgLR << "\n"); + LRsToCalculate.push_back(OutsideLR); + } + } + // ArgLR now contains only these values: + // - args that we are indirecting + // - other values inside the subroutines that we are indirecting + // We do not want it to get a register allocated, since those values will be + // indirected. We achieve that by setting ArgLR's category to NONE. + ArgLR->setCategory(RegCategory::NONE); + LLVM_DEBUG(dbgs() << " Not allocating register for arg's LR\n"); + // For each subroutine, replace the func with a new one that has an extra + // address arg. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->addAddressArg(); + // For each subroutine, fix up its call sites. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->fixCallSites(); + // Replace old function with new function. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->replaceFunction(); + // Run gatherBalesToModify again, as the list it made last time is now invalid + // due to code being changed. + if (!gatherBalesToModify(ArgLR, Align)) + llvm_unreachable("not expecting indirection to have become invalid in second run"); + // Indirect the bales. + for (auto bi = BalesToModify.begin(), be = BalesToModify.end(); + bi != be; ++bi) { + Instruction *Inst = *bi; + Bale B; + Baling->buildBale(Inst, &B); + auto argIter = Inst->getParent()->getParent()->arg_begin(); + std::advance(argIter, Inst->getParent()->getParent()->arg_size() - 1); + Argument *AddressArg = &*argIter; + indirectBale(&B, ArgLR, AddressArg); + } + // Recalculate live ranges as required. Rebuild the call graph first, as it + // has been made invalid by us replacing some functions. + { + Liveness->rebuildCallGraph(); + std::set LRsSeen; + for (auto i = LRsToCalculate.begin(), e = LRsToCalculate.end(); i != e; ++i) { + LiveRange *LR = *i; + if (LRsSeen.insert(LR).second) { + Liveness->rebuildLiveRange(LR); + LLVM_DEBUG(dbgs() << " recalculated " << *LR << "\n"); + } + } + } + // Coalesce (or insert copy on coalesce failure) new address args. 
+ for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->coalesceAddressArgs(); + return true; +} + +/*********************************************************************** + * checkIndirectability : check whether we want to and can indirect a + * subroutine argument, populating the SubrArg struct so we have the + * information needed to indirect it + * + * Return: NO_OPTIMIZATION : can indirect, but no optimization in terms of + * saving instructions or register pressure + * WANT_INDIRECTION : can indirect and it is an optimization. The live + * range does not include anything outside of subroutines where + * it is an arg, thus we need to ensure that no register is + * allocated to it. + * WANT_SOME_INDIRECTION : can indirect and it is an optimization. The + * live range does include something outside of subroutines where + * it is an arg, so we need to ensure that a register is allocated + * to it. We get this if some call sites are WANT_INDIRECTION and + * some are NO_OPTIMIZATION. + * CANNOT_INDIRECT : cannot indirect this live range at all. + */ +Indirectability SubroutineArg::checkIndirectability() +{ + if (F->hasFnAttribute(genx::FunctionMD::CMStackCall)) + return CANNOT_INDIRECT; + // See if there is a return value that is coalesced with the arg. + CoalescedRetIdx = -1; + for (unsigned ri = 0, re = IndexFlattener::getNumElements(F->getReturnType()); + ri != re; ++ri) { + if (Pass->Liveness->getLiveRange( + SimpleValue(Pass->Liveness->getUnifiedRet(F), ri)) == ArgLR) { + if (CoalescedRetIdx >= 0) { + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + auto CI = cast(ui->getUser()); + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument coalesced with multiple return values", DS_Warning); + CI->getContext().diagnose(Warn); + } + return Indirectability::CANNOT_INDIRECT; + } + CoalescedRetIdx = ri; + break; + } + } + // If there is no return value, check whether it is OK to indirect a call arg + // even if the call arg is not killed at the call. This is the case if there + // is no write to the subroutine arg's live range inside the subroutine(s) + // other than args and coalesced bitcasts. + CanCoalesceWithoutKill = true; + if (CoalescedRetIdx < 0) { + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) { + auto Inst = dyn_cast(vi->getValue()); + if (!Inst) + continue; // it's an arg, not an instruction + Function *Func = Pass->Liveness->isUnifiedRet(Inst); + if (!Func) + Func = Inst->getParent()->getParent(); + else + continue; + if (Pass->FuncMap.find(Func) == Pass->FuncMap.end()) + continue; // value not in one of the subroutines where the arg is indirected + auto BC = dyn_cast(Inst); + if (!BC || !Pass->Liveness->isBitCastCoalesced(BC)) { + CanCoalesceWithoutKill = false; + break; + } + } + } + + // Create an object of some subclass of CallSite for each call site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + auto CI = cast(ui->getUser()); + assert(ui->getOperandNo() == CI->getNumArgOperands()); + auto CallSite = createCallSite(CI); + if (!CallSite) + return Indirectability::CANNOT_INDIRECT; + CallSites.push_back(CallSite); + LLVM_DEBUG(dbgs() << " " << *CallSite << "\n"); + } + // Check indirection state for each call site. 
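+  // For illustration: the states are distinct powers of 2 (CALLER_INDIRECTING
+  // is 0), so OR-ing them below accumulates everything seen across the call
+  // sites, e.g.
+  //   NO_OPTIMIZATION | WANT_INDIRECTION == 1 | 2 == 3
+  // which the switch maps to WANT_SOME_INDIRECTION, while a mix of only
+  // CALLER_INDIRECTING and WANT_INDIRECTION still yields WANT_INDIRECTION.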
+ unsigned States = 0; + for (auto csi = CallSites.begin(), cse = CallSites.end(); csi != cse; ++csi) { + auto CallSite = *csi; + States |= CallSite->getState(); + } + switch (States & (Indirectability::NO_OPTIMIZATION | Indirectability::WANT_INDIRECTION)) { + case Indirectability::NO_OPTIMIZATION | Indirectability::WANT_INDIRECTION: + return Indirectability::WANT_SOME_INDIRECTION; + case Indirectability::WANT_INDIRECTION: + return Indirectability::WANT_INDIRECTION; + } + return Indirectability::NO_OPTIMIZATION; +} + +/*********************************************************************** + * createCallSite : create a CallSite object for this call + * + * Enter: CI = CallInst + * this->Arg = the Argument to look at + * this->ArgLR = its LiveRange + * this->CoalescedRetIdx = -1 else struct index of coalesced return value + * + * Return: 0 if this call stops arg indirection happening for this arg + * otherwise object of some subclass of CallSite + */ +CallSite *SubroutineArg::createCallSite(CallInst *CI) +{ + // Check if this call site is in a function that is itself indirecting the + // arg. + if (auto SubrArg = Pass->FuncMap[CI->getParent()->getParent()]) + return new CallerIndirectingCallSite(CI, SubrArg); + // Look at the call arg. + Value *V = CI->getArgOperand(Arg->getArgNo()); + // Skip any coalesced bitcasts. + while (auto BC = dyn_cast(V)) { + if (Pass->Liveness->getLiveRangeOrNull(BC->getOperand(0)) != ArgLR) + break; + V = BC->getOperand(0); + } + // If the call arg (before coalesced bitcasts) is a wrregion where the arg + // is the only use, try and parse it as a rd-wr sequence that reads a + // contiguous region and writes the whole of Arg. + RdWrRegionSequence ArgRWS; + if (!V->hasOneUse() || !GenXIntrinsic::isWrRegion(V) + || !ArgRWS.buildFromWr(cast(V), Pass->Baling) + || !ArgRWS.RdR.isContiguous() || !ArgRWS.WrR.isWhole(Arg->getType())) { + // Failed to find such a rd-wr sequence. Set ArgRWS to null. + ArgRWS = RdWrRegionSequence(); + } + // Look at the retval. + RdWrRegionSequence RetRWS; + if (CoalescedRetIdx >= 0) { + Value *RetVal = getRetVal(CI, CoalescedRetIdx); + if (!RetVal) { + // getRetVal could not determine what happens to this return value. + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value has unknown uses", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + if (!isa(RetVal)) { + // See if the return value has a single use in (after skipping coalesced + // bitcasts) a single wrregion or a rd-wr sequence. + // First skip single use coalesced bitcasts. + while (!RetVal->use_empty()) { + auto User = cast(RetVal->use_begin()->getUser()); + if (RetVal->hasOneUse()) { + if (auto BC = dyn_cast(User)) { + if (Pass->Liveness->getLiveRange(BC) == ArgLR) { + // Skip coalesced bitcast. + RetVal = BC; + continue; + } + } + } + // Attempt to parse as a rd-wr sequence that reads the whole of RetVal + // and writes a contiguous region, so it is either a legalized copy, or + // a legalized contiguous wrregion, and it is the only use of the input. + if (!GenXIntrinsic::isRdRegion(User) + || RetVal->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum + || !RetRWS.buildFromRd(User, Pass->Baling) + || !RetRWS.WrR.isContiguous() + || !RetRWS.RdR.isWhole(RetVal->getType()) + || !RetRWS.isOnlyUseOfInput()) { + // That failed, so make RetRWS null. + RetRWS = RdWrRegionSequence(); + } + break; + } + } + } + + // Now check the various cases. This results in the creation of an object of + // some subclass of CallSite. 
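+  // Hypothetical CM-level sketch of the situation behind Case 1 below (names
+  // invented for illustration): a constant-initialized matrix whose region is
+  // passed by ref,
+  //   matrix<float, 16, 16> M = 0.0f;
+  //   Sub(M.select<8, 1, 16, 1>(4, 0));
+  // The rdregion feeding the call folds to a constant, so the call arg is a
+  // constant and the retval's legalized wrregion writes into a constant
+  // "old value".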
+ + // Check that the regions are contiguous, and report if they are not. + if (ArgRWS.isNull() && !ArgRWS.RdR.isContiguous()) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Non-contiguous region", DS_Warning); + CI->getContext().diagnose(Warn); + return new NoOptCallSite(CI); + } + if (RetRWS.isNull() && !RetRWS.WrR.isContiguous()) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Non-contiguous region for coalesced return value", DS_Warning); + CI->getContext().diagnose(Warn); + return new NoOptCallSite(CI); + } + + // Case 1: The call arg is constant (inc undef, or a legalized constant + // load), and the retval is input to a wrregion sequence where the "old + // value" input is also a constant (a legalized constant load, also allowing + // for a bitcast). This typically happens when the arg and ret were a by ref + // region of a matrix, but the matrix was initialized to constant, or not + // initialized at all, before the call, so the rdregion got simplified away. + if (!RetRWS.isNull()) { + Value *RetOldVal = RetRWS.OldVal; + while (auto BC = dyn_cast(RetOldVal)) + RetOldVal = BC->getOperand(0); + auto *RetOldValC = dyn_cast(RetOldVal); + if (!RetOldValC && GenXIntrinsic::isWrRegion(RetOldVal)) { + RdWrRegionSequence ConstRWS; + if (ConstRWS.buildFromWr(cast(RetOldVal), Pass->Baling)) + RetOldValC = dyn_cast(ConstRWS.Input); + } + if (RetOldValC) { + Constant *Input; + if (!ArgRWS.isNull()) + Input = dyn_cast(ArgRWS.Input); + else + Input = dyn_cast(CI->getArgOperand(Arg->getArgNo())); + if (Input) { + // Get the Input constant to the same element type as RetOldValC. + if (RetOldValC->getType()->getScalarType() + != Input->getType()->getScalarType()) { + Type *ElTy = RetOldValC->getType()->getScalarType(); + assert(ElTy->getPrimitiveSizeInBits()); + Input = ConstantExpr::getBitCast(Input, + VectorType::get(ElTy, + Input->getType()->getPrimitiveSizeInBits() + / ElTy->getPrimitiveSizeInBits())); + } + // Construct the constant that needs to be loaded. + assert(RetOldValC->getType()->getScalarType() == Input->getType()->getScalarType()); + auto LdConst = RetRWS.WrR.evaluateConstantWrRegion(RetOldValC, Input); + // Create the ConstArgRetCallSite object. + return new ConstArgRetCallSite(CI, LdConst, RetRWS.EndWr, + RetRWS.getWrIndex()); + } + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value does not match constant argument", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + } + + // Case 2: The call arg is a legalized contiguous rdregion or copy of + // non-constant, and there is no retval coalesced with it. + if (RetRWS.isNull() && !ArgRWS.isNull() && CoalescedRetIdx < 0 + && !isa(ArgRWS.Input)) { + // It is valid to indirect this arg only if one of these is true: + // 1. the input to ArgRWS is not live over the call, or + // 2. the coalesced live range for the arg is not written to inside the + // subroutine or anything it calls. + if (CanCoalesceWithoutKill || !Pass->Liveness->getLiveRange(ArgRWS.Input) + ->contains(Pass->Numbering->getNumber(CI))) + return new IndirectArgCallSite(CI, ArgRWS.getInputUse(), + ArgRWS.getRdIndex()); + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument is region in value that is live over call", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 3: The call arg is a legalized rdregion or copy of non-constant, and + // the coalesced retval is a legalized wrregion or copy with the same region + // and the same base register. 
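+  // Conceptual IR shape for this case (illustrative only, simplified from the
+  // real genx.rdregion/wrregion intrinsics):
+  //   %r   = rdregion(%big, ..., %idx)        ; legalized copy of the region
+  //   %ret = call @Sub(..., %r, ...)
+  //   %new = wrregion(%big, %ret, ..., %idx)  ; same region, %big/%new coalesced
+  // Both sequences disappear once the call passes an address derived from
+  // %idx instead.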
+ if (!RetRWS.isNull() && !ArgRWS.isNull() && CoalescedRetIdx >= 0 + && !isa(ArgRWS.Input)) { + // Check the regions are the same. + if (ArgRWS.RdR == RetRWS.WrR) { + // Check the base registers are the same. + if (Pass->Liveness->getLiveRange(ArgRWS.Input) + == Pass->Liveness->getLiveRange(RetRWS.EndWr)) + return new IndirectArgRetCallSite(CI, ArgRWS.getInputUse(), + RetRWS.EndWr, ArgRWS.getRdIndex()); + } + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value does not match argument", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 4: No optimization for this call site, and cannot even indirect it + // unless either there is a coalesced retval, or the subroutine arg's LR is + // not written inside the subroutines, or the call arg is killed at the call. + if (!CanCoalesceWithoutKill && !ArgRWS.isNull() && !isa(ArgRWS.Input) + && Pass->Liveness->getLiveRange(ArgRWS.Input) + ->contains(Pass->Numbering->getNumber(CI))) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument is value that is live over call", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 5: No optimization for this call site (but it can still be indirected + // if some other call site would get optimized). + return new NoOptCallSite(CI); +} + +/*********************************************************************** + * getIndirectAlignment : get worst-case alignment of indices if we indirect + * this arg and retval + */ +Alignment SubroutineArg::getIndirectAlignment() const +{ + Alignment Align(5, 0); // best case is GRF aligned + for (auto csi = CallSites.begin(), cse = CallSites.end(); + csi != cse; ++csi) { + auto CallSite = *csi; + Value *Index = CallSite->getIndex(); + if (!Index) + continue; + Align = Align.merge(Pass->AI->get(Index)); + } + return Align; +} + +/*********************************************************************** + * gatherBalesToModify : check whether the arg can be indirected and + * gather the bales that need modifying + * + * Enter: Align = the worst case alignment of the indirection + * this->BalesToModify = vector to populate + * + * Return: true if can be indirected, with + * BalesToModify populated with bales that need indirecting + */ +bool GenXArgIndirection::gatherBalesToModify(LiveRange *ArgLR, Alignment Align) +{ + LLVM_DEBUG(dbgs() << "gatherBalesToModify: alignment " << Align << "\n"); + BalesToModify.clear(); + // We call SubroutineArg::gatherBalesToModify for each subroutine that has + // an arg in this live range. Just gathering bales for all instructions and + // args in the live range in one go would not work, because there might be a + // call site where the call arg is coalesced, and we would end up indirecting + // it and other things it is coalesced with. + for (auto si = SubrArgs.begin(), se = SubrArgs.end(); si != se; ++si) + si->gatherBalesToModify(Align); + // Check the bales to see if we can legally indirect accesses to any value in + // ArgLR (i.e. the arg, the retval, and anything coalesced with it) by doing + // a dry run of modifying them. + for (auto btmi = BalesToModify.begin(), btme = BalesToModify.end(); + btmi != btme; ++btmi) { + Bale B; + Baling->buildBale(*btmi, &B); + if (!checkIndirectBale(&B, ArgLR, Align)) { + // Failure. For error reporting, get the arg for the function in which the + // failure occurred. 
+ Argument *Arg = getArgForFunction(ArgLR, B.getHead()->Inst->getParent() + ->getParent()); + DiagnosticInfoArgIndirection Warn(B.getHead()->Inst, Arg, + "Use of argument cannot be indirected", DS_Warning); + B.getHead()->Inst->getContext().diagnose(Warn); + return false; + } + } + return true; +} + +/*********************************************************************** + * gatherBalesToModify : gather the bales that need modifying for this one + * subroutine arg + * + * Enter: Align = the worst case alignment of the indirection + * Pass->BalesToModify = vector to populate + * + * Return: BalesToModify populated with bales that need indirecting + */ +void SubroutineArg::gatherBalesToModify(Alignment Align) +{ + std::set BalesSeen; + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (Pass->Liveness->isUnifiedRet(V)) + continue; // ignore unified ret + if (auto Inst = dyn_cast(V)) { + if (Inst->getParent()->getParent() != F) + continue; // ignore instruction in wrong function + // Add the def to the list of bales that will need modifying, unless + // it is a phi node or coalesced bitcast or insert/extract in struct + // or a non-intrinsic call. + if (!isa(Inst) && (!isa(Inst) + || Pass->Liveness->getLiveRange(Inst->getOperand(0)) != ArgLR) + && !isa(Inst) && !isa(Inst) + && (!isa(Inst) + || GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst))) + if (BalesSeen.insert(Inst).second) + Pass->BalesToModify.push_back(Inst); + } else if (V != Arg) + continue; // ignore arg in wrong function + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto CI = dyn_cast(User)) { + Function *CF = CI->getCalledFunction(); + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CF)) { + // Non-intrinsic call. Ignore. (A call site using an arg being + // indirected gets handled differently.) + continue; + } + } else { + if (isa(User->getType())) + continue; // Ignore call with multiple retvals, or insert used to do + // multiple retvals + if (isa(User)) + continue; // Ignore extract in struct used to do multiple retvals + if (isa(User)) + continue; // Ignore phi nodes + if (isa(User)) + continue; // Ignore return instruction + if (isa(User) && Pass->Liveness->getLiveRange(User) == ArgLR) + continue; // Ignore coalesced bitcast + } + // Add the head of the bale to the list of bales that will need modifying. + auto UserHead = Pass->Baling->getBaleHead(User); + if (BalesSeen.insert(UserHead).second) + Pass->BalesToModify.push_back(UserHead); + } + } +} + +/*********************************************************************** + * checkIndirectBale : check if a bale can be indirected + * + * Enter: B = bale to check + * ArgLR = live range of values that need to be indirected + * Align = alignment of index being introduced + * + * Return: true if can be indirected + */ +bool GenXArgIndirection::checkIndirectBale(Bale *B, LiveRange *ArgLR, + Alignment Align) +{ + auto MainInst = B->getMainInst(); + if (MainInst) { + // Check for things about the main instruction that stop us indexing + // operand(s) or result in this bale. + if (MainInst->Inst->getType()->getPrimitiveSizeInBits() > 256 + && !ST->hasIndirectGRFCrossing()) { + // An execution size bigger than 1 GRF disqualifies the main + // instruction on <= BDW. 
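+      // For example, on a target with 32-byte GRFs and no indirect GRF
+      // crossing, a main instruction producing <16 x i32> (512 bits, i.e. two
+      // GRFs) cannot have its operands or result indirected here.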
+ LLVM_DEBUG(dbgs() << "execution size bigger than GRF\n"); + return false; + } + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(MainInst->Inst); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + // Cannot indirect a raw operand. We approximate this conservatively by + // spotting an intrinsic with void return type or with raw result. + if (MainInst->Inst->getType()->isVoidTy()) { + LLVM_DEBUG(dbgs() << "intrinsic with void return type assumed to have raw operands\n"); + return false; + } + if (GenXIntrinsicInfo(IID).getRetInfo().isRaw()) { + LLVM_DEBUG(dbgs() << "intrinsic with raw return value\n"); + return false; + } + } + } + // Check the rdregion(s) and wrregion. + for (auto bi = B->begin(), be = B->end(); bi != be; ++bi) { + switch (bi->Info.Type) { + case BaleInfo::WRREGION: + // Check wrregion if its result is coalesced with arg. + if (Liveness->getLiveRange(bi->Inst) == ArgLR) { + Region R(bi->Inst, bi->Info); + if (R.Indirect) + break; // already indirect + // Fake up scalar indirect index for the benefit of getLegalSize. + // It doesn't matter what the value is, as long as it is scalar. + R.Indirect = bi->Inst->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + if (R.NumElements != R.getLegalSize(0, /*Allow2D=*/false, + /*InputNumElements=*/UINT_MAX, ST, Align)) { + LLVM_DEBUG(dbgs() << "wrregion cannot be indirected: " << R << "\n"); + return false; + } + } + break; + case BaleInfo::RDREGION: + // Check rdregion if its input is coalesced with arg. + if (Liveness->getLiveRange(bi->Inst->getOperand(0)) == ArgLR) { + Region R(bi->Inst, bi->Info); + if (R.Indirect) + break; // already indirect + // Fake up scalar indirect index for the benefit of getLegalSize. + // It doesn't matter what the value is, as long as it is scalar. + R.Indirect = bi->Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum); + if (R.NumElements != R.getLegalSize(0, /*Allow2D=*/true, + /*InputNumElements=*/UINT_MAX, ST, Align)) { + LLVM_DEBUG(dbgs() << "rdregion cannot be indirected: " << R << "\n"; + dbgs() << R.getLegalSize(0, /*Allow2D=*/true, + /*InputNumElements=*/UINT_MAX, ST, Align) << "\n"); + return false; + } + } + break; + default: + break; + } + } + return true; +} + +/*********************************************************************** + * addAddressArg : for this subroutine, replace the Function with a new + * one with an extra address arg, and modify all call sites + * + * This sets this->NewFunc, and modifies this->Arg to the argument in the + * new function. + */ +void SubroutineArg::addAddressArg() +{ + // Create the new function type. + auto FTy = F->getFunctionType(); + SmallVector ArgTys; + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ArgTys.push_back(FTy->getParamType(i)); + ArgTys.push_back(Type::getInt16Ty(F->getContext())); + FTy = FunctionType::get(FTy->getReturnType(), ArgTys, false); + // Create the new function. + NewFunc = Function::Create(FTy, F->getLinkage(), ""); + NewFunc->takeName(F); + NewFunc->copyAttributesFrom(F); + F->getParent()->getFunctionList().insert(F->getIterator(), NewFunc); + // Set the new function's number to the same as the old function. + Pass->Numbering->setNumber(NewFunc, Pass->Numbering->getNumber(F)); + // Move the original function's unified return value across to the new + // function. + Pass->Liveness->moveUnifiedRet(F, NewFunc); + // The Function itself has a live range to represent the ranges of the + // subroutine itself and everything it calls. Change the Function in that + // live range. 
+ Pass->Liveness->replaceValue(F, NewFunc); + // Populate arrays OldArgs (the original func's args) and NewArgs (the new + // func's args). + SmallVector OldArgs, NewArgs; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) + OldArgs.push_back(&*ai); + for (auto ai = NewFunc->arg_begin(), ae = NewFunc->arg_end(); ai != ae; ++ai) + NewArgs.push_back(&*ai); + // For the original args, change uses to use the new args instead. Also + // change the old arg's live range to have the new arg instead. + for (unsigned ArgNum = 0; ArgNum != OldArgs.size(); ++ArgNum) { + NewArgs[ArgNum]->setName(OldArgs[ArgNum]->getName()); + OldArgs[ArgNum]->replaceAllUsesWith(NewArgs[ArgNum]); + Pass->Liveness->replaceValue(OldArgs[ArgNum], NewArgs[ArgNum]); + } + // Change the Arg in the current SubroutineArg, and save the address arg. + Arg = NewArgs[Arg->getArgNo()]; + AddressArgument = NewArgs.back(); + // Give the address arg a live range, and mark that it needs calculating. + auto LR = Pass->Liveness->getOrCreateLiveRange(AddressArgument); + LR->setCategory(RegCategory::ADDRESS); + Pass->LRsToCalculate.push_back(LR); + // Set the name of the new address arg. + NewArgs[OldArgs.size()]->setName(Arg->getName() + ".addr"); + // Move the function code across. + NewFunc->getBasicBlockList().splice(NewFunc->begin(), F->getBasicBlockList()); +} + +/*********************************************************************** + * fixCallSites : fix up a call to the subroutine, so it calls the new + * function instead and passes the extra address arg + * + * For each call site, this calls the process() method on the object of a + * subclass of CallSite set up by createCallSite(). That returns the extra + * address arg, which this function then uses to create a replacement call + * instruction. + */ +void SubroutineArg::fixCallSites() +{ + for (auto csi = CallSites.begin(), cse = CallSites.end(); csi != cse; ++csi) { + auto CallSite = *csi; + LLVM_DEBUG(dbgs() << " fixCallSites: [" << Pass->Numbering->getNumber(CallSite->CI) + << "] " << *CallSite << "\n"); + // Process the call site. + // Create the replacement call instruction, with an added address arg that + // for now we set to undef. We do this first so that process() called below + // can modify the arg being indirected such that the eraseUnusedTree erases + // the rd-wr sequence that sets up the arg in the old call. + SmallVector Args; + for (unsigned oi = 0, oe = CallSite->CI->getNumArgOperands(); + oi != oe; ++oi) + Args.push_back(CallSite->CI->getArgOperand(oi)); + Args.push_back(UndefValue::get(Type::getInt16Ty(CallSite->CI->getContext()))); + CallInst *OldCI = CallSite->CI; + CallSite->CI = CallInst::Create(NewFunc, Args, "", OldCI); + CallSite->CI->takeName(OldCI); + CallSite->CI->setDebugLoc(OldCI->getDebugLoc()); + Pass->Numbering->setNumber(CallSite->CI, Pass->Numbering->getNumber(OldCI)); + Pass->Numbering->setStartNumber(CallSite->CI, + Pass->Numbering->getStartNumber(OldCI)); + // Get the subclass of CallSite to do its processing, returning the extra + // address arg for the call. + Value *AddressArg = CallSite->process(Pass, this); + LLVM_DEBUG(dbgs() << " AddressArg is " << AddressArg->getName() << "\n"); + if (!isa(AddressArg)) { + // Create a live range for the address arg, and ensure it is recalculated. + LiveRange *AddressArgLR = Pass->Liveness->getOrCreateLiveRange(AddressArg); + AddressArgLR->setCategory(RegCategory::ADDRESS); + Pass->LRsToCalculate.push_back(AddressArgLR); + } + // Use the address arg in the new call. 
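+    // (Args.size() - 1 is the trailing operand that was pushed as undef
+    // above; it now receives the real address.)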
+ CallSite->CI->setOperand(Args.size() - 1, AddressArg); + // Replace the old call with the new one, and erase the old one. We use + // eraseUnusedTree so that any rd-wr sequence for the indirected arg is also + // erased. + OldCI->replaceAllUsesWith(CallSite->CI); + Pass->Liveness->replaceValue(OldCI, CallSite->CI); + Pass->Liveness->eraseUnusedTree(OldCI); + } +} + +/*********************************************************************** + * CallerIndirectingCallSite::process : arg indirection processing for a call + * site in a subroutine that is itself indirecting the arg + * + * Return: the address arg that needs to be passed to the call + */ +Value *CallerIndirectingCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + return CallerSubrArg->AddressArgument; +} + +/*********************************************************************** + * NoOptCallSite::process : arg indirection processing for a call site where + * no optimization is possible, but we can still indirect + * + * Return: the address arg that needs to be passed to the call + */ +Value *NoOptCallSite::process(GenXArgIndirection *Pass, SubroutineArg *SubrArg) +{ + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber( + CI, CI->getNumArgOperands() - 1, 0); + Instruction *InsertBefore = CI; + Type *I16Ty = Type::getInt16Ty(CI->getContext()); + // If the arg is undef, we can just use an undef address. + if (isa(CI->getArgOperand(SubrArg->Arg->getArgNo()))) + return UndefValue::get(I16Ty); + // Create a convert.addr of index 0, just before the call with the number of + // the arg pre-copy site for the new address argument that will be added. + auto Conv = createConvertAddr(ConstantInt::get(I16Ty, 0), 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + Conv->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(Conv, InsertNumber); + // Tell GenXLiveness the base register for this address register. The normal + // mechanism of tracing through to a user of the address does not work for an + // indirected arg. + Pass->Liveness->setArgAddressBase(Conv, + CI->getArgOperand(SubrArg->Arg->getArgNo())); + // If the live range of the input does not reach over the call, add a + // use of it (an unused bitcast) after the call and recalculate the + // live range. + unsigned CINumber = Pass->Numbering->getNumber(CI); + Value *Input = CI->getOperand(SubrArg->Arg->getArgNo()); + LiveRange *InputLR = Pass->Liveness->getLiveRange(Input); + if (!InputLR->contains(CINumber)) { + auto BC = CastInst::Create(Instruction::BitCast, Input, Input->getType(), + Input->getName() + ".dummy_use_for_indirection", CI->getNextNode()); + Pass->Liveness->setLiveRange(BC, InputLR); + Pass->Numbering->setNumber(BC, CINumber + 1); + Pass->LRsToCalculate.push_back(InputLR); + } + return Conv; +} + +/*********************************************************************** + * ConstArgRetCallSite::process : arg indirection processing for a call site + * where the arg is constant (including undef) and the arg is coalesced + * with a retval that is used only in a legalized wrregion whose "old + * value" input is constant. 
+ * + * Return: the address arg that needs to be passed to the call + */ +Value *ConstArgRetCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // checkCallSites detected the situation where the arg is a constant + // (probably a legalized constant load, detected by RdWrRegionSequence, + // but also including undef), and the ret is wrregioned (probably + // legalized) with a constant as the "old value" operand (including + // undef). + // + // To handle this, we create a new constant load of the two constants + // combined, before the call, to turn it back into the normal situation + // of a legalized rdregion before the call and a legalized wrregion + // after the call. (However we don't actually create the legalized + // rdregion and wrregion.) + // + // The combined constant was created in checkCallSites, and in this object + // it is LdConst. + // + // Any new instruction is inserted just before the call, and given the + // instruction number of the address arg's pre-copy slot. + Instruction *InsertBefore = CI; + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber( + CI, CI->getNumArgOperands() - 1, 0); + // Insert a load the constant. Bitcast it to the right type to replace + // RetEndWr. + SmallVector AddedInsts; + ConstantLoader CL(LdConst, nullptr, &AddedInsts); + auto LoadedConst = CL.loadBig(InsertBefore); + assert(LoadedConst); + if (LoadedConst->getType() != RetEndWr->getType()) { + LoadedConst = CastInst::Create(Instruction::BitCast, LoadedConst, + RetEndWr->getType(), LoadedConst->getName() + ".bitcast", + InsertBefore); + AddedInsts.push_back(LoadedConst); + } + // An added instruction (from the constant load) is allocated a live range as + // follows: + // 1. An instruction with the right result size is assumed to be coalesceable + // with the final result, and so put in the same live range as the retval's + // wrregion. + // 2. A (smaller) wrregion is assumed to be coalesceable with its "old value" + // input, if that is an instruction. + // 3. Otherwise it gets its own new live range. + // A wrregion also needs to be marked as such in baling. + auto RetValWrLR = Pass->Liveness->getLiveRange(RetEndWr); + unsigned LoadedConstSize = LoadedConst->getType()->getPrimitiveSizeInBits(); + for (auto i = AddedInsts.begin(), e = AddedInsts.end(); i != e; ++i) { + auto Inst = *i; + Pass->Numbering->setNumber(Inst, InsertNumber); + LiveRange *LR = nullptr; + if (Inst->getType()->getPrimitiveSizeInBits() == LoadedConstSize) + Pass->Liveness->setLiveRange(Inst, LR = RetValWrLR); + if (GenXIntrinsic::isWrRegion(Inst)) { + BaleInfo BI(BaleInfo::WRREGION); + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum))) + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Pass->Baling->setBaleInfo(Inst, BI); + if (auto InInst = dyn_cast( + Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum))) + if (!LR) + Pass->Liveness->setLiveRange(Inst, + LR = Pass->Liveness->getLiveRange(InInst)); + } + if (!LR) { + LR = Pass->Liveness->getOrCreateLiveRange(Inst); + LR->setCategory(RegCategory::GENERAL); + } + Pass->LRsToCalculate.push_back(LR); + } + // Create the genx.convert.addr for the region of that constant load. We + // use the offset of the retval's legalized wrregion. 
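+  // (Index is the wrregion start index captured as RetRWS.getWrIndex() when
+  // this ConstArgRetCallSite was created in createCallSite.)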
+ auto AddressArg = createConvertAddr(Index, 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + AddressArg->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(AddressArg, InsertNumber); + // Tell GenXLiveness the base register for this address register. + // The normal mechanism of tracing through to a user of the address + // does not work for an indirected arg. + Pass->Liveness->setArgAddressBase(AddressArg, LoadedConst); + // Undef out the arg in the call, so the old code to load the constant (if + // any) gets erased when the call is erased. + unsigned CallArgNum = SubrArg->Arg->getArgNo(); + CI->setOperand(CallArgNum, + UndefValue::get(CI->getOperand(CallArgNum)->getType())); + // Replace uses of the (legalized) wrregion sequence with the newly inserted + // constant load, then erase the sequence. + Instruction *ToErase = RetEndWr; + RetEndWr = nullptr; // need to do this as RetEndWr is an AssertingVH + Pass->replaceAndEraseSequence(ToErase, LoadedConst); + return AddressArg; +} + +/*********************************************************************** + * IndirectArgCallSite::process : arg indirection processing for a call site + * where the arg is a legalized rdregion or copy, and there is no retval + * coalesced with it. + * + * Return: the address arg that needs to be passed to the call + */ +Value *IndirectArgCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // Any new instruction is inserted just before the call, and given the + // instruction number of the address arg's pre-copy slot. + Instruction *InsertBefore = CI; + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber(CI, + CI->getNumArgOperands() - 1, 0); + Value *AddressArg = nullptr; + if (isa(Index)) { + // Constant index for the region. Add a convert.addr to load it into an + // address register. + auto Conv = createConvertAddr(Index, 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + Conv->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(Conv, InsertNumber); + AddressArg = Conv; + } else { + // Variable index for the region. The index is already converted to an + // address. It might be a genx.add.addr baled in to the rdregion; if so + // unbale it. + if (auto IndexInst = dyn_cast(Index)) + Pass->Baling->unbale(IndexInst); + AddressArg = Index; + } + // Tell GenXLiveness the base register for this address register. + // The normal mechanism of tracing through to a user of the address + // does not work for an indirected arg. + LiveRange *InputLR = Pass->Liveness->getLiveRange(*InputUse); + // Add a use of the input (an unused bitcast) in case: + // 1. the live range does not reach over the call (in which case we need to + // recalculate the live range after adding this use), or + // 2. later on, another arg indirection removes a use, meaning that the live + // range no longer reaches over the call (in which case we don't need to + // recalculate the live range yet). + auto BC = CastInst::Create(Instruction::BitCast, *InputUse, + (*InputUse)->getType(), + (*InputUse)->getName() + ".dummy_use_for_indirection", + CI->getNextNode()); + Pass->Liveness->setLiveRange(BC, InputLR); + Pass->Liveness->setArgAddressBase(AddressArg, BC); + unsigned CINumber = Pass->Numbering->getNumber(CI); + Pass->Numbering->setNumber(BC, CINumber + 1); + if (!InputLR->contains(CINumber)) + Pass->LRsToCalculate.push_back(InputLR); + // Undef out the arg in the call, so the old rd-wr sequence for the arg gets + // erased when the call is erased. 
+ unsigned CallArgNum = SubrArg->Arg->getArgNo(); + CI->setOperand(CallArgNum, + UndefValue::get(CI->getOperand(CallArgNum)->getType())); + return AddressArg; +} + +/*********************************************************************** + * IndirectArgRetCallSite::process : arg indirection processing for a call site + * where the arg is a legalized rdregion or copy, and the arg is coalesced + * with a retval that is used only in a legalized wrregion or copy. + * + * Return: the address arg that needs to be passed to the call + */ +Value *IndirectArgRetCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // Common code with IndirectArgCallSite above: + auto AddressArg = IndirectArgCallSite::process(Pass, SubrArg); + // Replace uses of the (legalized) wrregion sequence with the input to the + // legalized rdregion before the call. + Instruction *ToReplace = RetEndWr; + RetEndWr = nullptr; // Needed as RetEndWr is an AssertingVH + Pass->replaceAndEraseSequence(ToReplace, *InputUse); + return AddressArg; +} + +/*********************************************************************** + * GenXArgIndirection::replaceAndEraseSequence : replace uses of a wrregion + * sequence with a different value and erase the sequence, coping with + * different types due to bitcast + * + * Enter: RetEndWr = end of wrregion sequence + * V = value to replace its uses with (not constant, so it has a + * live range) + */ +void GenXArgIndirection::replaceAndEraseSequence(Instruction *RetEndWr, Value *V) +{ + // See if the types are different due to some bitcasting somewhere. First + // handle the case that V is the result of a bitcast whose input is the type + // we want. We can just use that input. + if (V->getType() != RetEndWr->getType()) + if (auto BC = dyn_cast(V)) + if (BC->getOperand(0)->getType() == RetEndWr->getType()) + V = BC->getOperand(0); + // Then handle other different type cases by inserting our own bitcast. + if (V->getType() != RetEndWr->getType()) { + auto BC = CastInst::Create(Instruction::BitCast, V, RetEndWr->getType(), + V->getName() + ".bitcast", RetEndWr); + Numbering->setNumber(BC, Numbering->getNumber(RetEndWr)); + Liveness->setLiveRange(BC, Liveness->getLiveRange(V)); + V = BC; + } + // Replace uses and erase resulting tree of unused instructions. + RetEndWr->replaceAllUsesWith(V); + Liveness->eraseUnusedTree(RetEndWr); +} + +/*********************************************************************** + * coalesceAddressArgs : for the new address arg, attempt to coalesce at + * each call site, inserting a copy on failure to coalesce + */ +void SubroutineArg::coalesceAddressArgs() +{ + LiveRange *AddressLR = Pass->Liveness->getLiveRange(AddressArgument); + unsigned ArgNum = AddressArgument->getArgNo(); + for (unsigned csi = 0, cse = CallSites.size(); csi != cse; ++csi) { + auto CallSite = CallSites[csi]; + Value *CallArg = CallSite->CI->getArgOperand(ArgNum); + if (isa(CallArg)) + continue; + LiveRange *CallArgLR = Pass->Liveness->getLiveRange(CallArg); + if (AddressLR == CallArgLR) + continue; + if (!Pass->Liveness->interfere(AddressLR, CallArgLR)) { + // No interference -- we can coalesce. + AddressLR = Pass->Liveness->coalesce(AddressLR, CallArgLR, + /*DisallowCASC=*/true); + continue; + } + // There is interference. This should not happen if the caller is another + // subroutine where we are indirecting the arg -- the new address args + // for each subroutine should coalesce together. 
+ LLVM_DEBUG(dbgs() << "Failed to coalesce:\n " << *AddressLR << "\n " << *CallArgLR << "\n"); + assert(!Pass->FuncMap[CallSite->CI->getParent()->getParent()] + && "new address args should coalesce together"); + // We need to insert a copy, in the address arg's pre-copy slot. An address + // copy is done with a genx.convert, even though it is not actually doing a + // conversion. + auto Copy = createConvert(CallArg, CallArg->getName() + ".coalescefail", + CallSite->CI); + Copy->setDebugLoc(CallSite->CI->getDebugLoc()); + Pass->Numbering->setNumber(Copy, Pass->Numbering->getArgPreCopyNumber( + CallSite->CI, ArgNum, 0)); + // Add the new value in to AddressLR. + Pass->Liveness->setLiveRange(Copy, AddressLR); + CallSite->CI->setOperand(ArgNum, Copy); + } +} + +/*********************************************************************** + * replaceFunction : replace the old function with the new function + * + * This replaces the function in the FunctionGroup, and then erases the old + * function. + */ +void SubroutineArg::replaceFunction() +{ + Pass->FGA->replaceFunction(F, NewFunc); + F->eraseFromParent(); + F = NewFunc; +} + +/*********************************************************************** + * indirectBale : modify a bale to be indirect + * + * Enter: B = bale to modify + * ArgLR = live range of values that need to be indirected + * AddressArg = new argument for address + * + * On return, the bale struct is no longer valid. + */ +void GenXArgIndirection::indirectBale(Bale *B, LiveRange *ArgLR, + Argument *AddressArg) +{ + // Indirect the head of the bale, if its result is in ArgLR. + auto Inst = B->getHead()->Inst; + if (Liveness->getLiveRange(Inst) == ArgLR) { + if (B->getHead()->Info.Type == BaleInfo::WRREGION) { + // wrregion: just modify the index to indirect it. + indirectRegion(&Inst->getOperandUse( + GenXIntrinsic::GenXRegion::WrIndexOperandNum), AddressArg, Inst); + } else { + // No wrregion: we need to add one, and ensure that the original + // instruction is baled into it. + Region R(Inst); + R.Indirect = AddressArg; + SmallVector Uses; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + auto NewWr = cast(R.createWrRegion( + UndefValue::get(Inst->getType()), Inst, + Inst->getName() + ".indirected", Inst->getNextNode(), + Inst->getDebugLoc())); + Liveness->setLiveRange(NewWr, ArgLR); + Liveness->removeValue(Inst); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = NewWr; + BaleInfo BI(BaleInfo::WRREGION); + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(NewWr, BI); + } + } + // Process operands in each instruction of the bale. + for (auto bi = B->begin(), be = B->end(); bi != be; ++bi) { + Inst = bi->Inst; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (bi->Info.isOperandBaled(oi)) + continue; // Ignore within-bale operands + if (!oi && bi->Info.Type == BaleInfo::WRREGION) + continue; // Ignore "old value" input to wrregion + Value *Opnd = Inst->getOperand(oi); + if (Liveness->getLiveRangeOrNull(Opnd) != ArgLR) + continue; // Not in ArgLR, does not need indirecting + if (bi->Info.Type == BaleInfo::RDREGION + && oi == GenXIntrinsic::GenXRegion::OldValueOperandNum) { + // input to rdregion: just modify the index to indirect it. 
+ indirectRegion(&bi->Inst->getOperandUse( + GenXIntrinsic::GenXRegion::RdIndexOperandNum), AddressArg, Inst); + } else { + // No rdregion: we need to add one, and ensure that it is baled in + // to the original instruction. + Region R(Opnd); + R.Indirect = AddressArg; + auto NewRd = R.createRdRegion(Opnd, Opnd->getName() + ".indirected", + Inst, Inst->getDebugLoc()); + Inst->setOperand(oi, NewRd); + BaleInfo BI = bi->Info; + BI.setOperandBaled(oi); + Baling->setBaleInfo(Inst, BI); + BaleInfo NewRdBI(BaleInfo::RDREGION); + Baling->setBaleInfo(NewRd, NewRdBI); + } + } + } +} + +/*********************************************************************** + * indirectRegion : convert a rdregion/wrregion index operand to indirect + * + * Enter: U = the rdregion/wrregion index operand use + * AddressArg = the index to use + * InsertBefore = where to insert new instructions + * + * If the rdregion/wrregion already has a variable index, then we create an + * instruction to remove its genx.convert.addr and add it to AddressArg with + * genx.add.addr. + */ +void GenXArgIndirection::indirectRegion(Use *U, Value *AddressArg, + Instruction *InsertBefore) +{ + Value *Addr = *U; + if (auto CI = dyn_cast(Addr)) { + // Currently the index is constant. + if (CI->isNullValue()) { + *U = AddressArg; + return; + } + // Create a genx.add.addr and give it an instruction number one less + // than InsertBefore. + auto NewAdd = createAddAddr(AddressArg, CI, "indirect.offset", InsertBefore); + Numbering->setNumber(NewAdd, Numbering->getNumber(InsertBefore) - 1); + *U = NewAdd; + // If the constant is within offset range, bale the new genx.add.addr into + // its user. + if (GenXBaling::isBalableIndexAdd(NewAdd)) { + auto User = cast(U->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.setOperandBaled(U->getOperandNo()); + Baling->setBaleInfo(User, BI); + } else { + // Otherwise, give it a live range, and mark it as needing calculating. + auto LR = Liveness->getOrCreateLiveRange(NewAdd); + LR->setCategory(RegCategory::ADDRESS); + LRsToCalculate.push_back(LR); + } + return; + } + // The index is already variable. + // Trace back through add_addr instructions until we find one of: + // 1. The convert_addr instruction set up by GenXCategory, and possibly + // commoned up by GenXAddressCommoning. We replace that with an + // add_addr instruction that adds the convert_addr's input to AddressArg. + // or + // 2. An Argument, so another user of the same address must have already + // found and replaced (1). + for (;;) { + if (isa(Addr)) + return; + auto IntrinsicID = GenXIntrinsic::getGenXIntrinsicID(Addr); + switch (IntrinsicID) { + case GenXIntrinsic::genx_add_addr: + Addr = cast(Addr)->getOperand(0); + continue; + case GenXIntrinsic::genx_rdregioni: + Addr = cast(Addr)->getOperand( + GenXIntrinsic::GenXRegion::OldValueOperandNum); + continue; + case GenXIntrinsic::genx_convert_addr: + // we've found what we wanted + break; + default: + llvm_unreachable("unsupported instruction"); + } + break; + } + assert(GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_convert_addr); + auto AddrInst = cast(Addr); + auto AddrSrc = AddrInst->getOperand(0); + // Create an add_addr to replace the convert_addr. It needs a live range with + // ADDRESS category. 
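+  // Roughly: the original convert_addr, e.g. %addr = genx.convert.addr(%offset),
+  // is replaced by %addr.indirectedaddr = genx.add.addr(%AddressArg, %offset)
+  // (illustrative pseudo-IR; the exact operands follow createAddAddr below), so
+  // the region is now addressed relative to the indirected argument.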
+ auto NewAddAddr = createAddAddr(AddressArg, AddrSrc, + AddrInst->getName() + ".indirectedaddr", AddrInst); + NewAddAddr->setDebugLoc(AddrInst->getDebugLoc()); + Numbering->setNumber(NewAddAddr, Numbering->getNumber(AddrInst) - 1); + AddrInst->replaceAllUsesWith(NewAddAddr); + LiveRange *LR = Liveness->getOrCreateLiveRange(NewAddAddr); + LR->setCategory(RegCategory::ADDRESS); + LRsToCalculate.push_back(LR); + // AddrSrc (source of convert_addr) should get a live range as well + LiveRange *SrcLR = Liveness->getOrCreateLiveRange(AddrSrc); + SrcLR->setCategory(RegCategory::GENERAL); + LRsToCalculate.push_back(SrcLR); + // remove the old convert_addr + Liveness->eraseLiveRange(AddrInst); + AddrInst->eraseFromParent(); +} + +/*********************************************************************** + * getArgForFunction : find the arg in a live range that belongs to a func + */ +Argument *GenXArgIndirection::getArgForFunction(LiveRange *LR, Function *F) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (auto Arg = dyn_cast(V)) + if (Arg->getParent() == F) + return Arg; + } + return nullptr; +} + +/*********************************************************************** + * getRetVal : get return value for possibly multi return value call + * + * Enter: CI = call instruction + * RetNum = return value number + * + * Return: the return value (which is either a CallInst or an + * ExtractValueInst), or 0 if unknown use, or undef if it is shown + * that the requested return value is never extracted from the struct + */ +Value *SubroutineArg::getRetVal(CallInst *CI, unsigned RetNum) +{ + auto ST = dyn_cast(CI->getType()); + if (!ST) { + assert(!RetNum); + return CI; + } + Value *RetVal = UndefValue::get(ST->getElementType(RetNum)); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EVI = dyn_cast(ui->getUser()); + if (!EVI || EVI->getNumIndices() != 1) + return nullptr; // unknown use + if (EVI->getIndices()[0] == RetNum) { + if (isa(RetVal)) + RetVal = EVI; + else + return nullptr; // multiple extractelements of the same retval + } + } + return RetVal; +} + +/*********************************************************************** + * DiagnosticInfoArgIndirection initializer from Instruction + * + * If the Instruction has a DebugLoc, then that is used for the error + * location. + * Otherwise, the location is unknown. + */ +DiagnosticInfoArgIndirection::DiagnosticInfoArgIndirection(Instruction *Inst, + Argument *Arg, const Twine &Desc, DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0) +{ + auto DL = Inst->getDebugLoc(); + if (DL) { + Filename = DL->getFilename(); + Line = DL.getLine(); + Col = DL.getCol(); + } + Description = (Twine("GenXArgIndirection failed for argument ") + + Twine(Arg->getArgNo() + 1) + " in " + Arg->getParent()->getName() + + ": " + Desc).str(); +} + +/*********************************************************************** + * DiagnosticInfoArgIndirection::print : print the error/warning message + */ +void DiagnosticInfoArgIndirection::print(DiagnosticPrinter &DP) const +{ + std::string Loc( + (Twine(!Filename.empty() ? Filename : "") + + ":" + Twine(Line) + + (!Col ? 
                 Twine() : Twine(":") + Twine(Col)) +
+       ": ")
+          .str());
+  DP << Loc << Description;
+}
+
+
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp
new file mode 100644
index 000000000000..9392dee402c9
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp
@@ -0,0 +1,2365 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+// GenX instruction baling is analyzed by this pass. See GenXBaling.h for more
+// detailed comment.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "GENX_INSTRUCTION_BALING"
+
+#include "GenXBaling.h"
+#include "GenXConstants.h"
+#include "GenXIntrinsics.h"
+#include "GenXLiveness.h"
+#include "GenXRegion.h"
+#include "GenXUtil.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+// Part of the bodge to allow abs to bale in to sext/zext. This needs to be set
+// to some arbitrary value that does not clash with any
+// GenXIntrinsicInfo::MODIFIER_* value.
+enum { MODIFIER_ABSONLY = 9000 }; + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +//---------------------------------------------------------------------- +// Administrivia for GenXFuncBaling pass +// +char GenXFuncBaling::ID = 0; +INITIALIZE_PASS(GenXFuncBaling, "GenXFuncBaling", "GenXFuncBaling", false, false) + +FunctionPass *llvm::createGenXFuncBalingPass(BalingKind Kind, GenXSubtarget *ST) +{ + initializeGenXFuncBalingPass(*PassRegistry::getPassRegistry()); + return new GenXFuncBaling(Kind, ST); +} + +void GenXFuncBaling::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesCFG(); +} + +//---------------------------------------------------------------------- +// Administrivia for GenXGroupBaling pass +// +char GenXGroupBaling::ID = 0; +INITIALIZE_PASS_BEGIN(GenXGroupBaling, "GenXGroupBaling", "GenXGroupBaling", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXGroupBaling, "GenXGroupBaling", "GenXGroupBaling", false, false) + +FunctionGroupPass *llvm::createGenXGroupBalingPass(BalingKind Kind, GenXSubtarget *ST) +{ + initializeGenXGroupBalingPass(*PassRegistry::getPassRegistry()); + return new GenXGroupBaling(Kind, ST); +} + +void GenXGroupBaling::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); +} + +/*********************************************************************** + * GenXGroupBaling::runOnFunctionGroup : run second baling pass on function + * group + */ +bool GenXGroupBaling::runOnFunctionGroup(FunctionGroup &FG) +{ + clear(); + Liveness = &getAnalysis(); + return processFunctionGroup(&FG); +} + +/*********************************************************************** + * processFunctionGroup : run instruction baling analysis on one + * function group + */ +bool GenXBaling::processFunctionGroup(FunctionGroup *FG) +{ + bool Modified = false; + for (auto i = FG->begin(), e = FG->end(); i != e; ++i) { + Modified |= processFunction(*i); + } + return Modified; +} + +/*********************************************************************** + * processFunction : run instruction baling analysis on one function + * + * This does a preordered depth first traversal of the CFG to + * ensure that we see a def before its uses (ignoring phi node uses). + * This is required when we see a constant add/sub used as a region or + * element variable index; if the add/sub has already been marked as + * baling in a modifier or rdregion then we cannot bale it in to the + * variable index region. + * + * This pass also clones any instruction that can be baled in but has + * multiple uses. A baled in instruction must have exactly one use. + */ +bool GenXBaling::processFunction(Function *F) +{ + bool Changed = prologue(F); + + for (df_iterator i = df_begin(&F->getEntryBlock()), + e = df_end(&F->getEntryBlock()); i != e; ++i) { + for (BasicBlock::iterator bi = i->begin(), be = i->end(); bi != be; ) { + Instruction *Inst = &*bi; + ++bi; // increment here as Inst may be erased + processInst(Inst); + } + } + // Process any two addr sends we found. + for (auto i = TwoAddrSends.begin(), e = TwoAddrSends.end(); i != e; ++i) + processTwoAddrSend(*i); + TwoAddrSends.clear(); + // Clone any instructions that we found in the pass that want to be baled in + // but have more than one use. 
+  if (NeedCloneStack.size()) {
+    doClones();
+    Changed = true;
+  }
+  return Changed;
+}
+
+/***********************************************************************
+ * processInst : calculate baling for an instruction
+ *
+ * Usually this is called from runOnFunction above. However another pass
+ * can call this to recalculate the baling for an instruction, particularly
+ * for a new instruction it has just added. GenXLegalization does this.
+ */
+void GenXBaling::processInst(Instruction *Inst)
+{
+  unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Inst);
+  if (GenXIntrinsic::isWrRegion(IntrinID))
+    processWrRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_wrpredregion)
+    processWrPredRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_wrpredpredregion)
+    processWrPredPredRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_sat || GenXIntrinsic::isIntegerSat(IntrinID))
+    processSat(Inst);
+  else if (GenXIntrinsic::isRdRegion(IntrinID))
+    processRdRegion(Inst);
+  else if (BranchInst *Branch = dyn_cast<BranchInst>(Inst))
+    processBranch(Branch);
+  else if (auto SI = dyn_cast<StoreInst>(Inst))
+    processStore(SI);
+  else if (isa<CallInst>(Inst) && cast<CallInst>(Inst)->isInlineAsm())
+    processInlineAsm(Inst);
+  else if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Inst))
+    processExtractValue(EV);
+  else if (isa<PtrToIntInst>(Inst) && cast<PtrToIntInst>(Inst)
+                                          ->getPointerOperand()
+                                          ->getType()
+                                          ->getPointerElementType()
+                                          ->isFunctionTy())
+    processFuncPointer(cast<PtrToIntInst>(Inst));
+  else {
+    // Try to bale a select into cmp's dst. If failed, continue to process
+    // select as a main instruction.
+    bool BaledSelect = processSelect(Inst);
+    if (!BaledSelect)
+      processMainInst(Inst, IntrinID);
+  }
+}
+
+/***********************************************************************
+ * static isRegionOKForIntrinsic : check whether a region is OK for an intrinsic arg
+ *
+ * Enter:   ArgInfoBits = mask for the ArgInfo for the intrinsic arg (or return value)
+ *          R = region itself
+ *          ST = check for this subtarget
+ *          AlignInfo = alignment info if provided (can be nullptr)
+ *          BKind = check before this baling type
+ *
+ * This checks that the arg is general (rather than raw) and does not have
+ * any stride restrictions that are incompatible with the region.
+ *
+ * In the legalization pass of baling, we always return true when the main
+ * instruction can be split. Otherwise, a region that would be OK after
+ * being split by legalization might here appear not OK, and that would stop
+ * legalization considering splitting it. However, if the main instruction
+ * cannot be split, then we need to check the full restriction; otherwise,
+ * if the region is considered baled and legalization is skipped, we may end
+ * up with an illegal standalone read-region.
+ */
+bool GenXBaling::isRegionOKForIntrinsic(unsigned ArgInfoBits, const Region &R,
+                                        bool CanSplitBale,
+                                        const GenXSubtarget *ST,
+                                        genx::AlignmentInfo *AlignInfo,
+                                        BalingKind BKind) {
+  GenXIntrinsicInfo::ArgInfo AI(ArgInfoBits);
+  if (!AI.isGeneral())
+    return false;
+  if (BKind == BalingKind::BK_Legalization) {
+    if (CanSplitBale)
+      return true;
+  }
+  if (R.Indirect && (AI.Info & GenXIntrinsicInfo::DIRECTONLY))
+    return false;
+  unsigned Restriction = AI.getRestriction();
+  if (!Restriction)
+    return true;
+  unsigned GRFWidth = ST ?
ST->getGRFWidth() : 32; + unsigned ElementsPerGrf = GRFWidth / R.ElementBytes; + unsigned GRFLogAlign = Log2_32(GRFWidth); + if (AI.Info & GenXIntrinsicInfo::GRFALIGNED) { + if (R.Indirect) { + // Instructions that cannot be splitted also cannot allow indirect + if (!CanSplitBale) + return false; + if (!AlignInfo) + return false; + Alignment AL = AlignInfo->get(R.Indirect); + if (AL.getLogAlign() < GRFLogAlign || AL.getExtraBits() != 0) + return false; + } else if (R.Offset & (GRFWidth - 1)) + return false; + if (R.is2D() && (R.VStride & (ElementsPerGrf - 1))) + return false; + } + if (AI.Info & GenXIntrinsicInfo::OWALIGNED) { + // Instructions that cannot be splitted also cannot allow indirect + if (R.Indirect) { + if (!CanSplitBale) + return false; + if (!AlignInfo) + return false; + Alignment AL = AlignInfo->get(R.Indirect); + if (AL.getLogAlign() < 4 || AL.getExtraBits() != 0) + return false; + } + if (R.Offset & 15) + return false; + if (R.is2D() && (R.VStride & ((ElementsPerGrf >> 1) - 1))) + return false; + } + switch (Restriction) { + case GenXIntrinsicInfo::SCALARORCONTIGUOUS: + if (!R.Stride && R.Width == R.NumElements) + break; + // fall through... + case GenXIntrinsicInfo::FIXED4: + case GenXIntrinsicInfo::CONTIGUOUS: + if (R.Stride != 1 || R.Width != R.NumElements) + return false; + break; + case GenXIntrinsicInfo::STRIDE1: + // For the dot product instructions, the vISA spec just says that the + // horizontal stride must be 1. It doesn't say anything about the + // width or the vertical stride. I am assuming that the width must also + // be at least 4, since the operation works on groups of 4 channels. + if (R.Stride != 1 || R.Width < 4) + return false; + break; + default: + break; + } + return true; +} + +/*********************************************************************** + * checkModifier : check whether instruction is a source modifier + * + * Enter: Inst = instruction to check + * + * Return: ABSMOD, NEGMOD, NOTMOD, ZEXT, SEXT or MAININST (0) if not modifier + */ +static int checkModifier(Instruction *Inst) +{ + switch (Inst->getOpcode()) { + case Instruction::Sub: + case Instruction::FSub: + // Negate is represented in LLVM IR by subtract from 0. 
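+    // For example %n = fsub <8 x float> zeroinitializer, %x (or sub 0, %x for
+    // integers) can be folded into a "-" source modifier on the instruction
+    // that consumes %n.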
+ if (Constant *Lhs = dyn_cast(Inst->getOperand(0))) { + // Canonicalize splats as well + if (isa(Lhs->getType())) + if (auto splat = Lhs->getSplatValue()) + Lhs = splat; + + if (Lhs->isZeroValue()) + return BaleInfo::NEGMOD; + } + break; + case Instruction::Xor: + if (isIntNot(Inst)) + return BaleInfo::NOTMOD; + break; + case Instruction::ZExt: + if (!Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) + return BaleInfo::ZEXT; + break; + case Instruction::SExt: + if (!Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) + return BaleInfo::SEXT; + break; + default: + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_absf: + return BaleInfo::ABSMOD; + default: + break; + } + break; + } + return BaleInfo::MAININST; +} + +/*********************************************************************** + * operandIsBaled : check if a main inst is baled + * + * Enter: Inst = the main inst + * OperandNum = operand number to look at + * ModType = what type of modifier (arith/logic/extonly/none) this + * operand accepts + * AI = GenXIntrinsicInfo::ArgInfo, so we can see any stride + * restrictions, omitted if Inst is not an intrinsic + */ +bool +GenXBaling::operandIsBaled(Instruction *Inst, + unsigned OperandNum, int ModType, + unsigned ArgInfoBits = GenXIntrinsicInfo::GENERAL) { + GenXIntrinsicInfo::ArgInfo AI(ArgInfoBits); + Instruction *Opnd = dyn_cast(Inst->getOperand(OperandNum)); + if (!Opnd) + return false; + // Check for source operand modifier. + if (ModType != GenXIntrinsicInfo::MODIFIER_DEFAULT) { + int Mod = checkModifier(Opnd); + switch (Mod) { + case BaleInfo::MAININST: + break; + case BaleInfo::ZEXT: + case BaleInfo::SEXT: + if (ModType != GenXIntrinsicInfo::MODIFIER_DEFAULT) + return true; + break; + case BaleInfo::NOTMOD: + if (ModType == GenXIntrinsicInfo::MODIFIER_LOGIC) + return true; + break; + case BaleInfo::ABSMOD: + // Part of the bodge to allow abs to be baled in to zext/sext. + if (ModType == MODIFIER_ABSONLY) + return true; + // fall through... + default: + if (ModType == GenXIntrinsicInfo::MODIFIER_ARITH) + return true; + break; + } + } + if (GenXIntrinsic::isRdRegion(Opnd)) { + // The operand is a rdregion. Check any restrictions. + // (Note we call isRegionOKForIntrinsic even when Inst is not an + // intrinsic, since in that case AI is initialized to a state + // where there are no region restrictions.) + bool CanSplitBale = true; + Region RdR(Opnd, BaleInfo()); + if (!isRegionOKForIntrinsic(AI.Info, RdR, CanSplitBale, ST, &AlignInfo, + Kind)) + return false; + + // Do not bale in a region read with multiple uses if + // - any use is bitcast, or + // - it is indirect. + // as bitcast will not bale its operands and indirect multiple-use region + // reads often lead to narrow simd width after legalization. + if (Opnd->getNumUses() > 1 && (Kind == BalingKind::BK_Legalization || + Kind == BalingKind::BK_Analysis)) { + for (auto U : Opnd->users()) + if (isa(U)) + return false; + Region R(cast(Opnd), BaleInfo()); + if (R.Indirect) + return false; + } + return true; + } + return false; +} + +/*********************************************************************** + * processWrPredRegion : set up baling info for wrpredregion + * + * The input to wrpredregion may be the following: + * 1) icmp or fcmp, in which case it is always baled. + * 2) constant, which may resulted from region simplification. 
+ */ +void GenXBaling::processWrPredRegion(Instruction *Inst) +{ + Value *V = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + assert(isa(V) || isa(V)); + BaleInfo BI(BaleInfo::WRPREDREGION); + if (isa(V)) { + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + } + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processWrPredPredRegion : set up baling info for wrpredpredregion + * + * The "new value" input to wrpredregion must be icmp or fcmp, and it is always + * baled. + * + * The condition input is assumed to be EM. But it might be an rdpredregion + * out of EM, in which case the rdpredregion is baled. The rdpredregion must + * have offset 0. + */ +void GenXBaling::processWrPredPredRegion(Instruction *Inst) +{ + assert(isa(Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum))); + BaleInfo BI(BaleInfo::WRPREDPREDREGION); + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + Value *Cond = Inst->getOperand(3); + if (GenXIntrinsic::getGenXIntrinsicID(Cond) == GenXIntrinsic::genx_rdpredregion) { + assert(cast(cast(Cond)->getOperand(1))->isNullValue()); + setOperandBaled(Inst, 3, &BI); + } + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processWrRegion : set up baling info for wrregion + */ +void GenXBaling::processWrRegion(Instruction *Inst) +{ + BaleInfo BI(BaleInfo::WRREGION); + // Get the instruction (if any) that creates the element/subregion to write. + unsigned OperandNum = GenXIntrinsic::GenXRegion::NewValueOperandNum; + Instruction *V = dyn_cast(Inst->getOperand(OperandNum)); + if (V && !V->hasOneUse()) { + // The instruction has multiple uses. + // We don't want to bale in the following cases, as they seem to make the + // code worse, unless this is load from a global variable. + if (V->getParent() != Inst->getParent()) { + auto isRegionFromGlobalLoad = [](Value *V) { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + auto LI = dyn_cast(cast(V)->getArgOperand(0)); + return LI && getUnderlyingGlobalVariable(LI->getPointerOperand()); + }; + // 0. It is in a different basic block to the wrregion. + if (!isRegionFromGlobalLoad(V)) + V = nullptr; + } else { + // 1. The maininst is a select. + Bale B; + buildBale(V, &B); + if (auto MainInst = B.getMainInst()) { + if (isa(MainInst->Inst) || + isHighCostBaling(BaleInfo::WRREGION, MainInst->Inst)) + V = nullptr; + } + // 2. There is an indirect rdregion with a constant offset (probably due to + // the risk of the jitter doing unfolding; this check may be unnecessary + // after HSW). + for (auto i = B.begin(), e = B.end(); i != e; ++i) { + if (i->Info.Type != BaleInfo::RDREGION) + continue; + if (!isa(i->Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum))) { + V = nullptr; + break; + } + } + } + // FIXME: Baling on WRREGION is not the right way to reduce the overhead + // from `wrregion`. Instead, register coalescing should be applied to + // enable direct defining of the WRREGION and minimize the value + // duplication. 
+ } + if (V) { + Region WrR(Inst, BaleInfo()); + if (isBalableNewValueIntoWrr(V, WrR, ST, &AlignInfo, Kind)) { + setOperandBaled(Inst, OperandNum, &BI); + if (Liveness) { + // Ensure the wrregion's result has an + // alignment of 32 if intrinsic with + // raw result was baled into + unsigned ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(V); + GenXIntrinsicInfo II(ValIntrinID); + if (GenXIntrinsic::isGenXIntrinsic(ValIntrinID) && + (ValIntrinID != GenXIntrinsic::genx_sat) && + !GenXIntrinsic::isRdRegion(V) && !GenXIntrinsic::isWrRegion(V) && + (II.getRetInfo().getCategory() == GenXIntrinsicInfo::RAW)) + Liveness->getOrCreateLiveRange(Inst)->LogAlignment = 5; + } + } + } + // Now see if there is a variable index with an add/sub with an in range + // offset that we can bale in, such that the add/sub does not already + // bale in other instructions. + OperandNum = 5; + if (isBalableIndexAdd(Inst->getOperand(OperandNum))) { + setOperandBaled(Inst, OperandNum, &BI); + // We always set up InstMap for an address add, even though it does not + // bale in any operands. + setBaleInfo(cast(Inst->getOperand(OperandNum)), BaleInfo(BaleInfo::ADDRADD, 0)); + } + // See if there is any baling in to the predicate (mask) operand. + if (processPredicate(Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum)) + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum, &BI); + // We always set up InstMap for a wrregion, even if it does not bale in any + // operands. + setBaleInfo(Inst, BI); +} + +// Process a select instruction. Return true if it can be baled into a cmp +// instruction, false otherwise. +bool GenXBaling::processSelect(Instruction *Inst) { + auto SI = dyn_cast(Inst); + if (!SI || !SI->getType()->isVectorTy()) + return false; + + // Only bale into a cmp instruction. + Value *Cond = SI->getCondition(); + if (!isa(Cond) || !Cond->getType()->isVectorTy() || + !Cond->hasOneUse()) + return false; + + // Only bale "select cond, -1, 0" + Constant *Src0 = dyn_cast(SI->getTrueValue()); + Constant *Src1 = dyn_cast(SI->getFalseValue()); + if (Src0 && Src0->isAllOnesValue() && Src1 && Src1->isNullValue()) { + BaleInfo BI(BaleInfo::CMPDST); + unsigned OperandNum = 0; + setOperandBaled(Inst, OperandNum, &BI); + setBaleInfo(Inst, BI); + } + + // No baling. + return false; +} + +// Process a store instruction. +void GenXBaling::processStore(StoreInst *Inst) { + BaleInfo BI(BaleInfo::GSTORE); + unsigned OperandNum = 0; + Instruction *V = dyn_cast(Inst->getOperand(OperandNum)); + if (GenXIntrinsic::isWrRegion(V)) + setOperandBaled(Inst, OperandNum, &BI); + else if (isa(V) && cast(V)->isInlineAsm()) + setOperandBaled(Inst, OperandNum, &BI); + setBaleInfo(Inst, BI); +} + +// We can bale in shufflevector of predicate if it is replicated slice. 
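+// A replicated slice repeats one contiguous slice of the input, for example
+// (illustrative IR)
+//   shufflevector <8 x i1> %p, <8 x i1> undef,
+//                 <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+// which replicates the first four elements of %p twice.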
+bool GenXBaling::processShufflePred(Instruction *Inst) { + assert(Inst->getType()->getScalarSizeInBits() == 1 && + "Expected bool shuffle"); + auto *SI = dyn_cast(Inst); + if (!SI) + return false; + + assert(ShuffleVectorAnalyzer(SI).isReplicatedSlice() && + "Predicate shuffle is not replicated slice!"); + BaleInfo BI(BaleInfo::SHUFFLEPRED); + setBaleInfo(SI, BI); + return true; +} + +/*********************************************************************** + * processPredicate : process predicate operand (to wrregion or branch) + * + * Enter: Inst = instruction with predicate operand + * OperandNum = operand number in Inst + * + * Return: whether operand can be baled in + * + * If the function returns true, the caller needs to call + * setOperandBaled(Inst, OperandNum, &BI) to actually bale it in. + * + * Unlike most baling, which proceeds in code order building a tree of baled in + * instructions, this function recurses, scanning backward through the code, + * because we only want to bale predicate operations all/any/not/rdpredregion + * once we know that the resulting predicate is used in wrregion or branch (as + * opposed to say a bitcast to int). + * + * So this function decides whether OperandNum in Inst is an instruction that + * is to be baled in, and additionally performs any further baling in to that + * instruction. + */ +bool GenXBaling::processPredicate(Instruction *Inst, unsigned OperandNum) { + Instruction *Mask = dyn_cast(Inst->getOperand(OperandNum)); + if (!Mask) + return false; + + if (Kind == BalingKind::BK_CodeGen && !isa(Mask->getType())) { + if (auto Extract = dyn_cast(Mask)) { + auto *GotoJoin = cast(Extract->getAggregateOperand()); + auto IID = GenXIntrinsic::getAnyIntrinsicID(GotoJoin); + if (IID == GenXIntrinsic::genx_simdcf_goto + || IID == GenXIntrinsic::genx_simdcf_join) { + // Second pass: Mask is the extractvalue of the !any(EM) result out of + // the result of goto/join. We mark both the use of the extract in the + // branch and the use of the goto/join in the extract as baled. The + // former is done by the caller when we return true. + BaleInfo BI; + setOperandBaled(Mask, /*OperandNum=*/0, &BI); + setBaleInfo(Mask, BI); + return true; + } + } + } + switch (GenXIntrinsic::getGenXIntrinsicID(Mask)) { + case GenXIntrinsic::genx_rdpredregion: { + if (Kind == BalingKind::BK_CodeGen) { +#if _DEBUG + // Sanity check the offset and number of elements being accessed. + unsigned MinSize = Inst->getType()->getScalarType()->getPrimitiveSizeInBits() == 64 ? 4 : 8; + unsigned NElems = Mask->getType()->getVectorNumElements(); + unsigned Offset = dyn_cast(Mask->getOperand(1))->getZExtValue(); + assert(exactLog2(NElems) >= 0 && (Offset & (std::min(NElems, MinSize) - 1)) == 0 && + "illegal offset and/or width in rdpredregion"); +#endif + } + // We always set up InstMap for an rdpredregion, even though it does not + // bale in any operands. + setBaleInfo(Mask, BaleInfo(BaleInfo::RDPREDREGION, 0)); + return true; + } + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: { + if (Kind != BalingKind::BK_CodeGen) + return false; // only bale all/any for CodeGen + // The mask is the result of an all/any. Bale that in. + // Also see if its operand can be baled in. + BaleInfo BI(BaleInfo::ALLANY); + if (processPredicate(Mask, /*OperandNum=*/0)) + setOperandBaled(Mask, /*OperandNum=*/0, &BI); + setBaleInfo(Mask, BI); + return true; + } + default: + break; + } + + if (isNot(Mask)) { + // The mask is the result of a notp. Bale that in. 
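+    // (isNot matches a "not" written as an xor with all-ones, e.g.
+    //  %m = xor <16 x i1> %p, <i1 true, i1 true, ...> -- illustrative IR.)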
+    // Also see if its operand can be baled in.
+    BaleInfo BI(BaleInfo::NOTP);
+    if (processPredicate(Mask, /*OperandNum=*/0))
+      setOperandBaled(Mask, /*OperandNum=*/0, &BI);
+    setBaleInfo(Mask, BI);
+    return true;
+  }
+
+  if (processShufflePred(Mask))
+    return true;
+
+  return false;
+}
+
+/***********************************************************************
+ * processSat : set up baling info for fp saturate
+ */
+void GenXBaling::processSat(Instruction *Inst)
+{
+  BaleInfo BI(BaleInfo::SATURATE);
+  // Get the instruction (if any) that creates the value to saturate.
+  unsigned OperandNum = 0;
+  Instruction *V = dyn_cast<Instruction>(Inst->getOperand(OperandNum));
+  if (V && V->hasOneUse()) {
+    // It is an instruction where we are the only use. We can bale it in, if
+    // it is a suitable instruction.
+    auto ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(V);
+    if (GenXIntrinsic::isRdRegion(ValIntrinID))
+      setOperandBaled(Inst, OperandNum, &BI);
+    else if (ValIntrinID == GenXIntrinsic::not_any_intrinsic) {
+      if (isa<BinaryOperator>(V) || (isa<CastInst>(V) && !isa<BitCastInst>(V)))
+        setOperandBaled(Inst, OperandNum, &BI);
+    } else if (!GenXIntrinsic::isWrRegion(ValIntrinID)) {
+      // V is an intrinsic other than rdregion/wrregion. Check that its return
+      // value is suitable for baling.
+      GenXIntrinsicInfo II(ValIntrinID);
+      if (!II.getRetInfo().isRaw() && II.getRetInfo().getSaturation() ==
+          GenXIntrinsicInfo::SATURATION_DEFAULT)
+        setOperandBaled(Inst, OperandNum, &BI);
+    }
+  }
+  // We always set up InstMap for a saturate, even if it does not bale in any
+  // operands.
+  setBaleInfo(Inst, BI);
+}
+
+/***********************************************************************
+ * processRdRegion : set up baling info for rdregion
+ */
+void GenXBaling::processRdRegion(Instruction *Inst)
+{
+  // See if there is a variable index with an add/sub with an in range
+  // offset that we can bale in, such that the add/sub does not already
+  // bale in other instructions.
+  const unsigned OperandNum = 4; // operand number of index in rdregion
+  BaleInfo BI(BaleInfo::RDREGION);
+  if (isBalableIndexAdd(Inst->getOperand(OperandNum))) {
+    setOperandBaled(Inst, OperandNum, &BI);
+    // We always set up InstMap for an address add, even though it does not
+    // bale in any operands.
+    setBaleInfo(cast<Instruction>(Inst->getOperand(OperandNum)),
+                BaleInfo(BaleInfo::ADDRADD, 0));
+  } else if (isBalableIndexOr(Inst->getOperand(OperandNum))) {
+    setOperandBaled(Inst, OperandNum, &BI);
+    // We always set up InstMap for an address or, even though it does not
+    // bale in any operands.
+    setBaleInfo(cast<Instruction>(Inst->getOperand(OperandNum)),
+                BaleInfo(BaleInfo::ADDROR, 0));
+  }
+  // We always set up InstMap for a rdregion, even if it does not bale in any
+  // operands.
+  setBaleInfo(Inst, BI);
+}
+
+/***********************************************************************
+ * processInlineAsm : rdregion results are baled into the inline asm
+ * instruction. Inline assembly remains the main instruction
+ * of the bale.
+ */ +void GenXBaling::processInlineAsm(Instruction *Inst) { + auto CI = dyn_cast(Inst); + assert((CI && CI->isInlineAsm()) && "Inline Asm expected"); + + BaleInfo BI(BaleInfo::MAININST); + for (unsigned I = 0; I < CI->getNumArgOperands(); I++) + if (auto RdR = dyn_cast(CI->getArgOperand(I))) + if (GenXIntrinsic::isRdRegion(RdR)) { + switch (GenXIntrinsic::getGenXIntrinsicID(RdR->getOperand(0))) { + default: + setOperandBaled(Inst, I, &BI); + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + continue; + } + } + + setBaleInfo(Inst, BI); +} + +void GenXBaling::processFuncPointer(PtrToIntInst *Inst) { + BaleInfo BI(BaleInfo::FADDR); + for (auto *U : Inst->users()) { + if (isa(U)) { + // need to clone wrregion sinking to select + // (can't do that on FuncPtrs lowering as it's actually + // a result of post-legalization) + // to achieve 3 bales: + // b1=FADDR b2=FADDR + // |ptrtoint| |ptrtoint| + // | | | | | | + // | | | | | | + // | wrr | | wrr | + // \ / + // \ / + // |select| + // b3=select + assert(Inst->hasOneUse()); + auto &DL = Inst->getModule()->getDataLayout(); + Region R(IntegerType::get(Inst->getContext(), 64), &DL); + auto NewWrr = R.createWrRegion( + UndefValue::get(IntegerType::get(Inst->getContext(), 64)), Inst, + Inst->getName(), Inst, Inst->getDebugLoc()); + U->replaceUsesOfWith(Inst, NewWrr); + } else if (isa(U)) { + // only bitcast -> rdregion are allowed + // this is typical for vector selects + assert(Inst->hasOneUse() && U->hasOneUse() && + isa(U->user_back()) && + GenXIntrinsic::isRdRegion(U->user_back())); + setBaleInfo(Inst, BI); + return; + } + } + + assert(Inst->hasOneUse() && isa(Inst->use_begin()->getUser()) && + GenXIntrinsic::isWrRegion(Inst->use_begin()->getUser())); + + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processExtractValue : Extract instructions can get elements from structure + * which was a result of inline assembly call with multiple outputs. + */ +void GenXBaling::processExtractValue(ExtractValueInst *EV) { + assert(EV); + if (auto CI = dyn_cast(EV->getAggregateOperand())) + if (CI->isInlineAsm()) + setBaleInfo(EV, BaleInfo(BaleInfo::MAININST, 0)); +} + +/*********************************************************************** + * static getIndexAdd : test whether the specified value is + * a constant add/sub that could be baled in as a variable index offset, + * but without checking that the index is in range + * + * Enter: V = the value that might be a constant add/sub + * Offset = where to store the offset of the constant add/sub + * + * Return: true if a constant add/sub was detected + * + * For the second run of GenXBaling, which is after GenXCategoryConversion, + * we are looking for an llvm.genx.add.addr rather than a real add/sub. + */ +bool GenXBaling::getIndexAdd(Value *V, int *Offset) +{ + if (Instruction *Inst = dyn_cast(V)) { + int IsConstAdd = 0; + switch (Inst->getOpcode()) { + case Instruction::Add: + IsConstAdd = 1; + break; + case Instruction::Sub: + IsConstAdd = -1; + break; + default: + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == + GenXIntrinsic::genx_add_addr) + IsConstAdd = 1; + break; + } + if (IsConstAdd) { + if (Constant *C = dyn_cast(Inst->getOperand(1))) { + if (isa(C->getType())) + C = C->getSplatValue(); + if (C) { + if (C->isNullValue()) { + *Offset = 0; + return true; + } + if (ConstantInt *CI = dyn_cast(C)) { + // It is a constant add/sub. 
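+          // For example "add i16 %idx, 96" gives *Offset = 96 and
+          // "sub i16 %idx, 32" gives *Offset = -32.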
+ *Offset = CI->getSExtValue() * IsConstAdd; + return true; + } + } + } + } + } + return false; +} + +/*********************************************************************** + * static getIndexOr : test whether the specified value is + * a constant Or that could be baled in as a variable index offset, + * but without checking that the index is in range + * + * Enter: V = the value that might be a constant or + * Offset = where to store the offset of the constant or + * + * Return: true if a constant or was detected + */ +bool GenXBaling::getIndexOr(Value *V, int &Offset) +{ + Instruction *Inst = dyn_cast(V); + if (!Inst) + return false; + + if (Inst->getOpcode() != Instruction::Or) + return false; + + // inst is Or from this point + Constant *C = dyn_cast(Inst->getOperand(1)); + if (!C) + return false; + + if (isa(C->getType())) + C = C->getSplatValue(); + + // getSplatValue could return nullptr + if (!C) + return false; + + if (C->isNullValue()) { + Offset = 0; + return true; + } + if (ConstantInt *CI = dyn_cast(C)) { + // check for or could be changed to add + if(!haveNoCommonBitsSet(Inst->getOperand(0), Inst->getOperand(1), + Inst->getModule()->getDataLayout())) + { + return false; + } + Offset = CI->getSExtValue(); + return true; + } + return false; +} + +/*********************************************************************** + * static isBalableIndexAdd : test whether the specified value is + * a constant add/sub that could be baled in as a variable index offset + * + * For the second run of GenXBaling, which is after GenXCategoryConversion, + * we are looking for an llvm.genx.add.addr rather than a real add/sub. + */ +bool GenXBaling::isBalableIndexAdd(Value *V) +{ + int Offset; + if (!getIndexAdd(V, &Offset)) + return false; + // It is a constant add/sub. Check the constant is in range. + return ( G4_MIN_ADDR_IMM <= Offset && Offset <= G4_MAX_ADDR_IMM); +} + +/*********************************************************************** + * static isBalableIndexOr : test whether the specified value is + * a constant Or that could be baled in as a variable index offset + */ +bool GenXBaling::isBalableIndexOr(Value *V) +{ + int Offset; + if (!getIndexOr(V, Offset)) + return false; + assert(Offset >=0 && "Offset in or appears to be less than zero"); + // It is a constant or. Check the constant is in range. + return (Offset <= G4_MAX_ADDR_IMM); +} + +/*********************************************************************** + * static isBalableNewValueIntoWrr : check whether the new val operand can + * be baled into wrr instruction + */ +bool GenXBaling::isBalableNewValueIntoWrr(Value *V, const Region &WrrR, + const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo, + BalingKind BKind) { + Instruction *Inst = dyn_cast(V); + if (!Inst) + return false; + // It is an instruction. We can bale it in, if it is a suitable + // instruction. + unsigned ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(Inst); + if (ValIntrinID == GenXIntrinsic::genx_sat || + GenXIntrinsic::isRdRegion(ValIntrinID)) + return true; + else if (ValIntrinID == GenXIntrinsic::not_any_intrinsic) { + if (isa(Inst) || + (isa(Inst) && !isa(Inst))) + return true; + else if (isMaskPacking(Inst)) + return true; + else if (isa(Inst) && cast(Inst)->isInlineAsm()) + return true; + else if (isa(Inst) && !WrrR.Mask) { + // Can bale in a select as long as the wrregion is unpredicated. 
+ return true; + } else if (isa(Inst)) { + // Each extract bales into its own WrRegionand remains + // the main instruction of the bale + auto Extract = cast(Inst); + if (auto CI = dyn_cast(Extract->getAggregateOperand())) + if (CI->isInlineAsm()) + return true; + } + } else if (!GenXIntrinsic::isWrRegion(ValIntrinID)) { + // V is an intrinsic other than rdregion/wrregion. If this is a + // predicated wrregion, only permit baling in if the intrinsic + // supports a predicate mask. + GenXIntrinsicInfo II(ValIntrinID); + + if (WrrR.Mask == 0 || II.getPredAllowed()) { + // Check that its return value is suitable for baling. + GenXIntrinsicInfo::ArgInfo AI = II.getRetInfo(); + switch (AI.getCategory()) { + case GenXIntrinsicInfo::GENERAL: { + bool CanSplitBale = true; + if (isRegionOKForIntrinsic(AI.Info, WrrR, CanSplitBale, ST, AlignInfo, + BKind)) + return true; + } break; + case GenXIntrinsicInfo::RAW: { + // Intrinsic with raw result can be baled in to wrregion as long as + // it is unstrided and starts on a GRF boundary, and there is no + // non-undef TWOADDR operand. + if (isRegionOKForRaw(WrrR, ST)) { + unsigned FinalCallArgIdx = Inst->getNumOperands() - 2; + if (isa(Inst->getOperand(FinalCallArgIdx))) + return true; + else { + GenXIntrinsicInfo::ArgInfo AI2 = II.getArgInfo(FinalCallArgIdx); + if (AI2.getCategory() != GenXIntrinsicInfo::TWOADDR) + return true; + } + } + } break; + } + } + } + return false; +} + +bool GenXBaling::isHighCostBaling(uint16_t Type, Instruction *Inst) { + switch (Type) { + case BaleInfo::WRREGION: + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_xor: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_inc: + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_xor: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_fmax: + case GenXIntrinsic::genx_typed_atomic_inc: + case GenXIntrinsic::genx_typed_atomic_dec: + case GenXIntrinsic::genx_typed_atomic_cmpxchg: + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + case GenXIntrinsic::genx_gather_scaled: + case GenXIntrinsic::genx_gather4_scaled: + case GenXIntrinsic::genx_gather4_typed: + case GenXIntrinsic::genx_media_ld: + case GenXIntrinsic::genx_oword_ld: + case GenXIntrinsic::genx_oword_ld_unaligned: + case GenXIntrinsic::genx_svm_block_ld: + case GenXIntrinsic::genx_svm_block_ld_unaligned: + case GenXIntrinsic::genx_svm_gather: + case GenXIntrinsic::genx_svm_gather4_scaled: + case GenXIntrinsic::genx_svm_atomic_add: + case GenXIntrinsic::genx_svm_atomic_sub: + case 
GenXIntrinsic::genx_svm_atomic_min: + case GenXIntrinsic::genx_svm_atomic_max: + case GenXIntrinsic::genx_svm_atomic_xchg: + case GenXIntrinsic::genx_svm_atomic_and: + case GenXIntrinsic::genx_svm_atomic_or: + case GenXIntrinsic::genx_svm_atomic_xor: + case GenXIntrinsic::genx_svm_atomic_imin: + case GenXIntrinsic::genx_svm_atomic_imax: + case GenXIntrinsic::genx_svm_atomic_inc: + case GenXIntrinsic::genx_svm_atomic_dec: + case GenXIntrinsic::genx_svm_atomic_cmpxchg: + case GenXIntrinsic::genx_load: + case GenXIntrinsic::genx_sample: + case GenXIntrinsic::genx_sample_unorm: + case GenXIntrinsic::genx_3d_sample: + case GenXIntrinsic::genx_3d_load: + case GenXIntrinsic::genx_avs: + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_va_convolve2d: + case GenXIntrinsic::genx_va_hdc_convolve2d: + case GenXIntrinsic::genx_va_erode: + case GenXIntrinsic::genx_va_hdc_erode: + case GenXIntrinsic::genx_va_dilate: + case GenXIntrinsic::genx_va_hdc_dilate: + case GenXIntrinsic::genx_va_minmax: + case GenXIntrinsic::genx_va_minmax_filter: + case GenXIntrinsic::genx_va_hdc_minmax_filter: + case GenXIntrinsic::genx_va_bool_centroid: + case GenXIntrinsic::genx_va_centroid: + case GenXIntrinsic::genx_va_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_hdc_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_1d_convolve_vertical: + case GenXIntrinsic::genx_va_hdc_1d_convolve_vertical: + case GenXIntrinsic::genx_va_1pixel_convolve: + case GenXIntrinsic::genx_va_hdc_1pixel_convolve: + case GenXIntrinsic::genx_va_1pixel_convolve_1x1mode: + case GenXIntrinsic::genx_va_lbp_creation: + case GenXIntrinsic::genx_va_hdc_lbp_creation: + case GenXIntrinsic::genx_va_lbp_correlation: + case GenXIntrinsic::genx_va_hdc_lbp_correlation: + case GenXIntrinsic::genx_va_correlation_search: + case GenXIntrinsic::genx_va_flood_fill: + return true; + default: + break; + } + break; + } + return false; +} + +/*********************************************************************** + * processMainInst : set up baling info for potential main instruction + */ +void GenXBaling::processMainInst(Instruction *Inst, int IntrinID) +{ + BaleInfo BI(BaleInfo::MAININST); + if (IntrinID == Intrinsic::dbg_value) + return; + if (IntrinID == GenXIntrinsic::not_any_intrinsic) { + if (!isa(Inst) && !isa(Inst) + && !isa(Inst) && !isa(Inst)) + return; + if (isa(Inst)) + return; + BI.Type = checkModifier(Inst); + // Work out whether the instruction accepts arithmetic, logic or no + // modifier. + int ModType = GenXIntrinsicInfo::MODIFIER_ARITH; + switch (BI.Type) { + case BaleInfo::NOTMOD: + // a "not" can only merge with a logic modifier (another "not") + ModType = GenXIntrinsicInfo::MODIFIER_LOGIC; + break; + case BaleInfo::ZEXT: + case BaleInfo::SEXT: + // an extend cannot bale in any other modifier. + // But as a bodge we allow abs to be baled in to zext/sext. This is a + // workaround for not having worked out how to set the computation type + // in cm_abs. Currently cm_abs does a genx.absi in the source type, then + // converts it to destination type. This does not allow for the result + // of an abs needing one more bit than its input. + ModType = MODIFIER_ABSONLY; + break; + case BaleInfo::MAININST: + switch (Inst->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // These instructions take a logic modifier. 
+ ModType = GenXIntrinsicInfo::MODIFIER_LOGIC; + break; + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + // Do not allow source modifier on integer shift operations, + // because of extra precision introduced. + ModType = GenXIntrinsicInfo::MODIFIER_DEFAULT; + break; + default: + // All other (non-intrinsic) instructions take an arith modifier. + break; + } + break; + default: + // Anything else is an arith modifier, so it can only merge with + // another arith modifier. + break; + } + unsigned i = 0; + if (isa(Inst)) { + // Deal specially with operand 0, the selector, of a select. + const unsigned OperandNum = 0; + if (processPredicate(Inst, OperandNum)) + setOperandBaled(Inst, OperandNum, &BI); + ++i; + } + // See which operands we can bale in. + for (unsigned e = Inst->getNumOperands(); i != e; ++i) + if (operandIsBaled(Inst, i, ModType)) + setOperandBaled(Inst, i, &BI); + } else if (IntrinID == GenXIntrinsic::genx_convert + || IntrinID == GenXIntrinsic::genx_convert_addr) { + // llvm.genx.convert can bale, and has exactly one arg + if (operandIsBaled(Inst, 0, GenXIntrinsicInfo::MODIFIER_ARITH)) + setOperandBaled(Inst, 0, &BI); + } else if (GenXIntrinsic::isAbs(IntrinID)) { + BI.Type = BaleInfo::ABSMOD; + if (operandIsBaled(Inst, 0, GenXIntrinsicInfo::MODIFIER_ARITH)) + setOperandBaled(Inst, 0, &BI); + } else { + // For an intrinsic, check the arg info of each arg to see if we can + // bale into it. + GenXIntrinsicInfo Info(IntrinID); + for (const auto *p = Info.getInstDesc(); *p; ++p) { + GenXIntrinsicInfo::ArgInfo AI(*p); + if (AI.isArgOrRet() && !AI.isRet()) { + unsigned ArgIdx = AI.getArgIdx(); + switch (AI.getCategory()) { + case GenXIntrinsicInfo::GENERAL: + // This source operand of the intrinsic is general. + if (operandIsBaled(Inst, ArgIdx, AI.getModifier(), AI.Info)) + setOperandBaled(Inst, ArgIdx, &BI); + break; + case GenXIntrinsicInfo::RAW: + // Rdregion can be baled in to a raw operand as long as it is + // unstrided and starts on a GRF boundary. Ensure that the input to + // the rdregion is 32 aligned. + if (isValueRegionOKForRaw(Inst->getOperand(ArgIdx), + /*IsWrite=*/false, ST)) { + setOperandBaled(Inst, ArgIdx, &BI); + if (Liveness) { + Value *Opnd = Inst->getOperand(ArgIdx); + Opnd = cast(Opnd)->getOperand(0); + Liveness->getOrCreateLiveRange(Opnd)->LogAlignment = 5; + } + } + break; + case GenXIntrinsicInfo::TWOADDR: + if (Kind == BalingKind::BK_CodeGen) { + // Record this as a two address send for processing later. + TwoAddrSends.push_back(cast(Inst)); + } + break; + case GenXIntrinsicInfo::PREDICATION: + // See if there is any baling in to the predicate (mask) operand. + if (processPredicate(Inst, ArgIdx)) + setOperandBaled(Inst, ArgIdx, &BI); + break; + } + } + } + } + + // If this instruction is a modifier, we attempt to simplify it here + // (i.e. fold constants), to avoid confusion later in GenXCisaBuilder + // if a modifier has a constant operand. Because this pass scans code + // forwards, a constant will propagate through a chain of modifiers. + if (BI.Type != BaleInfo::MAININST) { + Value *Simplified = nullptr; + if (BI.Type != BaleInfo::ABSMOD) { + const DataLayout &DL = Inst->getModule()->getDataLayout(); + Simplified = SimplifyInstruction(Inst, SimplifyQuery(DL)); + } else { + // SimplifyInstruction does not work on abs, so we roll our own for now. 
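+      // For example genx.absi applied to a constant i32 -5 folds here to
+      // i32 5: compare the constant against zero and negate it if negative.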
+ if (auto C = dyn_cast(Inst->getOperand(0))) { + if (C->getType()->isIntOrIntVectorTy()) { + if (!ConstantExpr::getICmp(CmpInst::ICMP_SLT, C, + Constant::getNullValue(C->getType()))->isNullValue()) + C = ConstantExpr::getNeg(C); + } else { + if (!ConstantExpr::getFCmp(CmpInst::FCMP_OLT, C, + Constant::getNullValue(C->getType()))->isNullValue()) + C = ConstantExpr::getFNeg(C); + } + Simplified = C; + } + } + if (Simplified) { + assert(isa(Simplified) && "expecting a constant when simplifying a modifier"); + Inst->replaceAllUsesWith(Simplified); + Inst->eraseFromParent(); + return; + } + } + + // Only give an instruction an entry in the map if (a) it is not a main + // instruction or (b) it bales something in. + if (BI.Type || BI.Bits) + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processBranch : process a branch instruction + * + * If the branch is conditional, bale in all/any/not + */ +void GenXBaling::processBranch(BranchInst *Branch) +{ + if (Branch->isConditional()) { + BaleInfo BI(BaleInfo::MAININST); + if (processPredicate(Branch, 0/*OperandNum of predicate*/)) { + setOperandBaled(Branch, 0/*OperandNum*/, &BI); + setBaleInfo(Branch, BI); + } + } +} + +/*********************************************************************** + * processTwoAddrSend : process a two-address send + * + * A "two-address send" is a send (or an intrinsic that becomes a send in the + * finalizer) with a potentially partial write, so it has a TWOADDR operand to + * represent the value of the destination before the operation, and that + * TWOADDR operand is not undef. + * + * This only gets called in the second baling pass. + * + * We can bale a rdregion into the TWOADDR operand and bale the send into a + * wrregion, but only if the two have the same region and "old value" input. + * + * We used to allow such baling in first baling, such that legalization would + * then not split the rdregion and wrregion. In bug 4607, we ran into a problem + * where code changed due to vector decomposition, and the same baling did not + * happen in second baling, leaving an illegally wide rdregion or wrregion. + * + * So now we only do this special kind of baling in the second baling pass. + * That means that we have to detect where the rdregion and wrregion have been + * split by legalization. We use the RdWrRegionSequence class to do that. + */ +void GenXBaling::processTwoAddrSend(CallInst *CI) +{ + unsigned TwoAddrOperandNum = CI->getNumArgOperands() - 1; + assert(GenXIntrinsicInfo(GenXIntrinsic::getAnyIntrinsicID(CI)) + .getArgInfo(TwoAddrOperandNum) + .getCategory() == GenXIntrinsicInfo::TWOADDR); + assert(GenXIntrinsicInfo(GenXIntrinsic::getAnyIntrinsicID(CI)) + .getRetInfo() + .getCategory() == GenXIntrinsicInfo::RAW); + // First check the case where legalization did not need to split the rdregion + // and wrregion. 
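+  // The pattern looked for is roughly (illustrative pseudo-IR):
+  //   %rd   = rdregion(%old, <region R>)          ; the TWOADDR operand
+  //   %send = <send-like intrinsic>(..., %rd)
+  //   %wr   = wrregion(%old, %send, <region R>)
+  // i.e. the rdregion and the wrregion cover the same region of the same
+  // "old value", possibly through bitcasts.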
+ auto TwoAddrOperand = dyn_cast(CI->getArgOperand(TwoAddrOperandNum)); + if (!TwoAddrOperand) + return; + if (GenXIntrinsic::isRdRegion(TwoAddrOperand)) { + if (!CI->hasOneUse()) + return; + auto Rd = cast(TwoAddrOperand); + auto Wr = cast(CI->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + return; + if (CI->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::NewValueOperandNum) + return; + Region RdR(Rd, BaleInfo()); + Region WrR(Wr, BaleInfo()); + if (RdR != WrR || RdR.Indirect || WrR.Mask) + return; + if (!isValueRegionOKForRaw(Wr, /*IsWrite=*/true, ST)) + return; + // Everything else is in place for a rd-send-wr baling. We just need to check + // that the input to the read sequence is the same as the old value input to + // the write sequence. We need to allow for some bitcasts in the way. Having + // different bitcasts on the two inputs is ok, as long as the original value + // is the same, because bitcasts are always copy coalesced so will be in the + // same register. + Value *RdIn = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + Value *WrIn = Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + while (auto BC = dyn_cast(RdIn)) + RdIn = BC->getOperand(0); + while (auto BC = dyn_cast(WrIn)) + WrIn = BC->getOperand(0); + if (RdIn != WrIn) + return; + // We can do the baling. + auto BI = getBaleInfo(CI); + setOperandBaled(CI, TwoAddrOperandNum, &BI); + setBaleInfo(CI, BI); + BI = getBaleInfo(Wr); + setOperandBaled(Wr, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + setBaleInfo(Wr, BI); + return; + } + // Second, check the case where legalization has split the rdregion and + // wrregion. + if (CI->use_empty()) + return; + if (!GenXIntrinsic::isWrRegion(TwoAddrOperand)) + return; + RdWrRegionSequence RdSeq; + if (!RdSeq.buildFromWr(TwoAddrOperand, this)) + return; + RdWrRegionSequence WrSeq; + auto Rd = cast(CI->use_begin()->getUser()); + if (!GenXIntrinsic::isRdRegion(Rd)) + return; + if (!WrSeq.buildFromRd(Rd, this)) + return; + if (!RdSeq.WrR.isWhole(CI->getType())) + return; + if (!WrSeq.RdR.isWhole(CI->getType())) + return; + if (RdSeq.RdR.Indirect || WrSeq.WrR.Indirect) + return; + if (RdSeq.RdR != WrSeq.WrR) + return; + // Everything else is in place for a rd-send-wr baling. We just need to check + // that the input to the read sequence is the same as the old value input to + // the write sequence. We need to allow for some bitcasts in the way. Having + // different bitcasts on the two inputs is ok, as long as the original value + // is the same, because bitcasts are always copy coalesced so will be in the + // same register. + Value *RdIn = RdSeq.Input; + Value *WrIn = WrSeq.OldVal; + while (auto BC = dyn_cast(RdIn)) + RdIn = BC->getOperand(0); + while (auto BC = dyn_cast(WrIn)) + WrIn = BC->getOperand(0); + if (RdIn != WrIn) + return; + // Check that there are no uses of CI other than in WrSeq. We can do that by + // counting the uses. + unsigned NumUses = 0, Size = WrSeq.size(); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (++NumUses > Size) + return; + // We can bale, but we need to unlegalize back to a single rdregion and + // single wrregion. 
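+  // Schematically (illustrative only): the legalized read sequence RdSeq
+  // (from RdSeq.Input to the TWOADDR operand) and write sequence WrSeq (from
+  // the send result to WrSeq.EndWr) are replaced by
+  //   %newrd = rdregion(RdSeq.Input, RdSeq.RdR)
+  //   %newwr = wrregion(WrSeq.OldVal, CI, WrSeq.WrR)
+  // so that a single rdregion and a single wrregion can be baled with the send.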
+ auto NewRd = RdSeq.RdR.createRdRegion(RdSeq.Input, RdSeq.StartWr->getName(), + RdSeq.StartWr, RdSeq.StartWr->getDebugLoc()); + CI->setOperand(TwoAddrOperandNum, NewRd); + auto NewWr = cast(WrSeq.WrR.createWrRegion(WrSeq.OldVal, CI, + WrSeq.StartWr->getName(), WrSeq.StartWr, WrSeq.StartWr->getDebugLoc())); + WrSeq.EndWr->replaceAllUsesWith(NewWr); + // Set baling info for new instructions. The BI for NewWr is just a copy of + // the first wrregion in the sequence being replaced. + setBaleInfo(NewWr, getBaleInfo(WrSeq.StartWr)); + auto BI = getBaleInfo(CI); + setOperandBaled(CI, TwoAddrOperandNum, &BI); + setBaleInfo(CI, BI); + // Remove original sequences if now unused. + for (Instruction *End = RdSeq.EndWr;;) { + for (Instruction *Wr = End; Wr && Wr->use_empty(); ) { + if (!Wr->use_empty()) + break; + if (Wr->getNumOperands() < 2) + break; + auto Rd = dyn_cast(Wr->getOperand(1)); + auto NextWr = dyn_cast(Wr->getOperand(0)); + Liveness->eraseLiveRange(Wr); + Wr->eraseFromParent(); + assert(Rd); + if (Rd->use_empty()) { + Liveness->eraseLiveRange(Rd); + Rd->eraseFromParent(); + } + Wr = NextWr; + } + if (End == WrSeq.EndWr) + break; + End = WrSeq.EndWr; + } +} + +/*********************************************************************** + * setBaleInfo : set BaleInfo for an instruction + */ +void GenXBaling::setBaleInfo(const Instruction *Inst, genx::BaleInfo BI) +{ + assert(BI.Bits < 1 << Inst->getNumOperands()); + InstMap[static_cast(Inst)] = BI; +} + +/*********************************************************************** + * setOperandBaled : set flag to say that an operand is baled in + * + * Enter: Inst = instruction to bale into + * OperandNum = operand number in that instruction + * BI = BaleInfo to set flag in + * + * If the operand value has multiple uses, this also flags that we will need + * to do some cloning afterwards to ensure that a baled in operand has a + * single use. + * + * Note that a main instruction baled into a saturate modifier or into + * a wrregion, or a saturate modifier baled into a wrregion, never has + * multiple uses. So the multiple use thing only covers source operands + * of the main inst, plus a possible addradd in the wrregion. + */ +void GenXBaling::setOperandBaled(Instruction *Inst, unsigned OperandNum, + BaleInfo *BI) +{ + // Set the bit. + BI->Bits |= 1 << OperandNum; + // Check whether the operand has more than one use. + Instruction *BaledInst = cast(Inst->getOperand(OperandNum)); + if (!BaledInst->hasOneUse()) { + // Multiple uses. Add to the NeedClone stack. But not if it is a goto/join; + // we allow a goto/join to be baled into the extract of its !any(EM) result + // even though it has uses in other extracts. + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(BaledInst); + if (IID != GenXIntrinsic::genx_simdcf_goto && + IID != GenXIntrinsic::genx_simdcf_join) + NeedCloneStack.push_back(NeedClone(Inst, OperandNum)); + } +} + +/*********************************************************************** + * doClones : do any cloning required to make baled in instructions + * single use + * + * NeedCloneStack is a stack of operands (instruction and operand number + * pairs) that are baled in and have more than one use, so need cloning. + * They were pushed in forward order, so if A is baled into B is baled + * into C then the use of A in B was pushed before the use of B in C. + * + * We now pop off the stack in reverse order. We see the use of B in C, + * and clone B to single use B'. 
Then we see that B bales in A, so we + * add the use of A in B' onto the stack, causing A to be cloned later. + * In this way we handle nested baling correctly. + */ +void GenXBaling::doClones() +{ + while (NeedCloneStack.size()) { + // Pop a NeedClone off the stack. + NeedClone NC = NeedCloneStack.back(); + NeedCloneStack.pop_back(); + // See if it is still multiple use (earlier cloning may have caused this + // one to become single use). + Instruction *Opnd = cast(NC.Inst->getOperand(NC.OperandNum)); + if (Opnd->hasOneUse()) + continue; + // See if it is still baled. But continue with cloning even if not baled in + // these cases: + // 1. An extend (zext or sext), because it tends to result in better gen + // code, probably because a zext or sext can be baled in to its user by + // the finalizer in a case where we cannot because of the vISA + // restriction that both operands need the same extend. This case arises + // only if we were going to bale the extend in, but then decided not to + // because the two operands did not have the same extend. + // 2. An address generating instruction, because, at this point in the flow + // (between GenXCategory and GenXAddressCommoning), an address + // generating instruction must have a single use. + bool IsBaled = getBaleInfo(NC.Inst).isOperandBaled(NC.OperandNum); + if (!IsBaled && !isa(Opnd) && + getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(NC.Inst)) != (int)NC.OperandNum) + continue; + // Clone it. + assert(!isa(Opnd)); + Instruction *Cloned = Opnd->clone(); + Cloned->setName(Opnd->getName()); + // Change the use. + NC.Inst->setOperand(NC.OperandNum, Cloned); + if (IsBaled) { + // Normally, insert the cloned instruction just after the original. + Cloned->insertAfter(Opnd); + } else { + // In the special case that we are cloning something even when not baled: + // Ensure the cloned instruction has the same category as the original + // one. + if (Liveness) + Liveness->getOrCreateLiveRange(Cloned)->setCategory( + Liveness->getOrCreateLiveRange(Opnd)->getCategory()); + // Insert the clone just before its single use. + Cloned->insertBefore(NC.Inst); + // If the instruction that we cloned is now single use, not in a phi + // node, move it to just before its use. + if (Opnd->hasOneUse()) { + auto User = Opnd->use_begin()->getUser(); + if (!isa(User)) { + Opnd->removeFromParent(); + Opnd->insertBefore(cast(User)); + } + } + } + // Copy the bale info. + BaleInfo BI = getBaleInfo(Opnd); + setBaleInfo(Cloned, BI); + // Stack any operands of the cloned instruction that are baled. (They + // must be multiple use because we have just cloned the instruction + // using them.) Also any address calculation, for the reason given in the + // comment above. 
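+    // For example: if multi-use B has just been cloned to B' and B baled in A,
+    // the use of A in B' is pushed here so that A gets cloned on a later
+    // iteration, handling nested baling as described above.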
+    int AON = getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(Cloned));
+    for (unsigned i = 0, e = Cloned->getNumOperands(); i != e; ++i)
+      if (BI.isOperandBaled(i) ||
+          (Kind == BalingKind::BK_CodeGen && AON == (int)i &&
+           isa<Instruction>(Cloned->getOperand(i))))
+        NeedCloneStack.push_back(NeedClone(Cloned, i));
+  }
+}
+
+/***********************************************************************
+ * getOrUnbaleExtend : get or unbale the extend instruction (if any) in
+ *      this operand
+ *
+ * Enter:   Inst = instruction containing operand
+ *          BI = BaleInfo for Inst
+ *          OperandNum = operand number to look at
+ *          Unbale = true to unbale the extend
+ *
+ * Return:  0 if no extend found, else the extend (ZExt or SExt), and, if
+ *          Unbale is true, then *BI has been modified _and_ written back
+ *          into Inst's map entry in GenXBaling.
+ *
+ * BI is a pointer to handle two slightly different cases of unbaling the ext:
+ * 1. If this is the top level call to getOrUnbaleExtend from processMainInst,
+ *    then we want to modify the caller's BaleInfo pointed to by BI, which the
+ *    caller is in the middle of setting up and will write back into the map.
+ * 2. If this is a recursive call from getOrUnbaleExtend, then we want to
+ *    use setBaleInfo to write the BaleInfo back into the map.
+ * We don't check which case we have, and we just do both things, as the
+ * unneeded one is harmless.
+ */
+Instruction *GenXBaling::getOrUnbaleExtend(Instruction *Inst, BaleInfo *BI,
+    unsigned OperandNum, bool Unbale)
+{
+  if (!BI->isOperandBaled(OperandNum))
+    return nullptr;
+  auto Opnd = cast<Instruction>(Inst->getOperand(OperandNum));
+  if (isa<ZExtInst>(Opnd) || isa<SExtInst>(Opnd)) {
+    // Found an extend. Unbale it if requested. But do not remove it from the
+    // NeedClone stack; we still clone an extend that is not being baled in on
+    // the basis that the jitter will be able to bale it in because gen allows
+    // mismatched integer operand types.
+ if (Unbale) { + BI->clearOperandBaled(OperandNum); + setBaleInfo(Inst, *BI); + } + return Opnd; + } + BaleInfo ThisBI = getBaleInfo(Opnd); + if (ThisBI.isOperandBaled(0)) + return getOrUnbaleExtend(Opnd, &ThisBI, 0, Unbale); + if (ThisBI.isOperandBaled(1)) + return getOrUnbaleExtend(Opnd, &ThisBI, 1, Unbale); + return nullptr; +} + +/*********************************************************************** + * dump, print : dump the result of the GenXBaling analysis + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXBaling::dump() +{ + print(errs()); +} +#endif + +void GenXBaling::print(raw_ostream &OS) +{ + for (InstMap_t::iterator i = InstMap.begin(), e = InstMap.end(); i != e; ++i) { + const Instruction *Inst = cast(i->first); + BaleInfo *BI = &i->second; + OS << Inst->getName() << ": "; + switch (BI->Type) { + case BaleInfo::WRREGION: OS << "WRREGION"; break; + case BaleInfo::SATURATE: OS << "SATURATE"; break; + case BaleInfo::MAININST: OS << "MAININST"; break; + case BaleInfo::ABSMOD: OS << "ABSMOD"; break; + case BaleInfo::NEGMOD: OS << "NEGMOD"; break; + case BaleInfo::NOTMOD: OS << "NOTMOD"; break; + case BaleInfo::RDREGION: OS << "RDREGION"; break; + default: OS << "??"; break; + } + for (unsigned OperandNum = 0, e = Inst->getNumOperands(); + OperandNum != e; ++OperandNum) + if (BI->isOperandBaled(OperandNum)) + OS << " " << OperandNum; + OS << "\n"; + } +} + +/*********************************************************************** + * getBaleParent : return the instruction baled into, 0 if none + */ +Instruction *GenXBaling::getBaleParent(Instruction *Inst) +{ + // We can rely on the fact that a baled in instruction always has exactly + // one use. The exception is llvm.genx.simdcf.goto/join, which is baled in + // to the extractvalue that extracts the !any(EM) value. Rather than check + // the intrinsic ID, we check whether the return type is struct. + auto use = Inst->use_begin(); + if (!Inst->hasOneUse()) { + if (!isa(Inst->getType())) + return nullptr; + // For an llvm.genx.simdcf.goto/join, the use we want is the extractvalue + // that extracts the !any(EM) value from the result struct. 
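+    // Schematic shape (illustrative; the exact struct layout is not relied on
+    // here, only the i1 element type):
+    //   %g   = call { <N x i1>, <N x i1>, i1 } @llvm.genx.simdcf.goto...(...)
+    //   %em  = extractvalue %g, 0
+    //   %any = extractvalue %g, 2      ; scalar i1 -- the use we select below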
+ for (auto ue = Inst->use_end();; ++use) { + if (use == ue) + return nullptr; + if (!isa(use->getUser())) + return nullptr; + if (use->getUser()->getType()->isIntegerTy(1)) + break; + } + } + Instruction *user = cast(use->getUser()); + BaleInfo BI = getBaleInfo(user); + if (!BI.isOperandBaled(use->getOperandNo())) + return nullptr; + return cast(use->getUser()); +} + +/*********************************************************************** + * unbale : unbale an instruction from its bale parent + */ +void GenXBaling::unbale(Instruction *Inst) +{ + if (!Inst->hasOneUse()) + return; + Value::use_iterator use = Inst->use_begin(); + Instruction *user = cast(use->getUser()); + BaleInfo BI = getBaleInfo(user); + unsigned OperandNum = use->getOperandNo(); + if (!BI.isOperandBaled(OperandNum)) + return; + BI.clearOperandBaled(OperandNum); + setBaleInfo(user, BI); +} + +/*********************************************************************** + * getBaleHead : return the head of the bale containing Inst + */ +Instruction *GenXBaling::getBaleHead(Instruction *Inst) +{ + for (;;) { + Instruction *Parent = getBaleParent(Inst); + if (!Parent) + break; + Inst = Parent; + } + return Inst; +} + +/*********************************************************************** + * buildBale : populate a Bale from the head instruction + * + * Enter: Inst = the head instruction + * B = Bale struct, assumed empty + * IncludeAddr = default false, true to include address calculations + * even when not baled in + * + * IncludeAddr is used by GenXUnbaling to include the address calculation of + * a rdregion in the bale, so it can be considered together when deciding + * whether to unbale and move. This works because an address calculation has + * exactly one use, until GenXAddressCommoning commons them up later. + */ +void GenXBaling::buildBale(Instruction *Inst, Bale *B, bool IncludeAddr) const +{ + assert(!B->size()); + buildBaleSub(Inst, B, IncludeAddr); +} + +void GenXBaling::buildBaleSub(Instruction *Inst, Bale *B, bool IncludeAddr) const +{ + BaleInfo BI = getBaleInfo(Inst); + B->push_front(BaleInst(Inst, BI)); + + if (isa(Inst) || + (isa(Inst) && !cast(Inst)->isInlineAsm() && + !GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst))) + return; + if (IncludeAddr) { + int AddrOperandNum = getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(Inst)); + if (AddrOperandNum >= 0) { + // IncludeAddr: pretend that the address calculation is baled in, as long + // as it is an instruction. + if (auto OpndInst = dyn_cast(Inst->getOperand(AddrOperandNum))) { + assert(OpndInst->hasOneUse()); (void)OpndInst; + BI.setOperandBaled(AddrOperandNum); + B->front().Info = BI; + } + } + } + + assert(BI.Bits < (1 << Inst->getNumOperands()) || Inst->getNumOperands() > 16); + + while (BI.Bits) { + unsigned Idx = genx::log2(BI.Bits); + BI.Bits &= ~(1 << Idx); + if (Instruction *Op = dyn_cast(Inst->getOperand(Idx))) + buildBaleSub(Op, B, IncludeAddr); + } +} + +/*********************************************************************** + * getAddrOperandNum : given an intrinsic ID, get the address operand number + * + * For rdregion/wrregion, it returns the operand number of the index operand. + * + * For genx_add_addr, it returns 0 (the only operand number) + * + * In any other case, it returns -1. + * + * This is used both in buildBale when IncludeAddr is true, and in doClones, + * to find the address operand of an instruction. 
+ */ +int GenXBaling::getAddrOperandNum(unsigned IID) const +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + return GenXIntrinsic::GenXRegion::RdIndexOperandNum; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + return GenXIntrinsic::GenXRegion::WrIndexOperandNum; + case GenXIntrinsic::genx_add_addr: + return 0; + default: + return -1; + } +} + +/*********************************************************************** + * store : store updated BaleInfo for instruction + * + * Enter: BI = BaleInst struct + * + * This function stores BI.Info as the new BaleInfo for BI.Inst + * + * It is used by GenXLegalization to unbale. + */ +void GenXBaling::store(BaleInst BI) +{ + assert(BI.Info.Bits < 1<< BI.Inst->getNumOperands()); + InstMap[BI.Inst] = BI.Info; +} + +static bool skipTransform(Instruction *DefI, Instruction *UseI) { + SmallPtrSet DInsts; + BasicBlock *BB = UseI->getParent(); + + // Special case for extracting out of subroutine call. + if (isa(DefI)) + return true; + + // This is a local optimization only. + for (auto U : DefI->users()) { + auto UI = dyn_cast(U); + if (UI == nullptr || UI->getParent() != BB) + return true; + if (UI != UseI) + DInsts.insert(UI); + } + + // If a use is crossing the next region write, + // then two regions are live at the same time. + // Very likely this increases register pressure + // and/or results region copies. + // + // Scan forward starting from Region write, + // check if this hits a write to this region + // before some use. + // + SmallPtrSet UInsts; + bool IsLocal = !UseI->isUsedOutsideOfBlock(BB); + if (IsLocal) { + for (auto U : UseI->users()) { + auto UI = dyn_cast(U); + if (UI != nullptr) + UInsts.insert(UI); + } + } + + for (auto I = UseI; I; I = I->getNextNode()) { + if (I == &BB->back()) + break; + if (DInsts.empty()) + break; + + // UInst is local and it is dead now. + if (IsLocal && UInsts.empty()) + break; + + // There is a region write before some use. + if (GenXIntrinsic::isWrRegion(I) && + I->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == UseI) + return true; + + if (DInsts.count(I)) + DInsts.erase(I); + if (UInsts.count(I)) + UInsts.erase(I); + } + + // Not all users are checked which means UseI does not + // dominate them, or UseI is local and dead before some uses. + return !DInsts.empty(); +} + +// Normalize ill-formed gstores. +// Correct gstore should be in form of: +// x = gload G +// w = wrr x, StoreVal +// gstore w, G +static void normalizeGStore(StoreInst &SI) { + auto LI = + new LoadInst(SI.getPointerOperand(), ".gload", true /*volatile*/, &SI); + Value *StoreOp = SI.getValueOperand(); + Region R(StoreOp); + auto WrR = + R.createWrRegion(LI, StoreOp, ".wrr.gstore", &SI, SI.getDebugLoc()); + SI.setOperand(0 /*Value operand idx*/, WrR); +} + +// If operand of gstore is phi and all its incoming values +// form legal values for gstore, then return true. +// All incoming blocks should have single successor. +// Otherwise return false. +static bool canPropagatePhiGStore(StoreInst &SI) { + Value *Val = SI.getValueOperand(); + auto *Phi = dyn_cast(Val); + if (!Phi) + return false; + + if (!llvm::all_of(Phi->blocks(), + [](BasicBlock *BB) { return BB->getSingleSuccessor(); })) + return false; + + Value *StorePtr = SI.getPointerOperand(); + // This can be weakened, but then new gstores should be normalized too. 
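+  // Illustrative accepted shape (schematic):
+  //   bb1: %w1 = wrregion(load G, %v1, R1)   ; bb1 has a single successor
+  //   bb2: %w2 = wrregion(load G, %v2, R2)   ; bb2 has a single successor
+  //   bb3: %p  = phi [ %w1, %bb1 ], [ %w2, %bb2 ]
+  //        store %p, G                       ; later duplicated into bb1/bb2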
+ return llvm::all_of(Phi->incoming_values(), [StorePtr](Use &U) { + return isLegalValueForGlobalStore(U, StorePtr); + }); +} + +// Duplicate gstore in blocks with its legal value operands coming from phi. +// After that, there will be legal gstores that can be baled later. +// Old gstore with phi is deleted. +static void propagatePhiGStore(StoreInst &SI) { + auto *Phi = cast(SI.getValueOperand()); + for (Use &U : Phi->incoming_values()) { + auto *NewSI = cast(SI.clone()); + auto *WrrInst = cast(U); + NewSI->insertBefore(WrrInst->getParent()->getTerminator()); + NewSI->setOperand(0 /*Value operand idx*/, WrrInst); + } + SI.eraseFromParent(); + if (Phi->user_empty()) + Phi->eraseFromParent(); +} + +// Normalize gstores. +// There are two main cases: +// 1) gstore of phi, then there will be attempt to hoist gstore to +// its value, if that will give correct gstores. +// 2) Otherwise, just ill-formed gstore. Normalize it. +static void normalizeGStores(Function &F) { + SmallVector PhiWorklist; + SmallVector NormalizeWorklist; + // Collect phi and ill-formed gloads. + for (auto &I : instructions(F)) { + auto *SI = dyn_cast(&I); + if (!SI || !isGlobalStore(SI)) + continue; + if (canPropagatePhiGStore(*SI)) + PhiWorklist.push_back(SI); + else if (!isGlobalStoreLegal(SI)) + NormalizeWorklist.push_back(SI); + } + + // Handle everything. + for (auto *SI : PhiWorklist) + propagatePhiGStore(*SI); + for (auto *SI : NormalizeWorklist) + normalizeGStore(*SI); +} + +// Cleanup and optimization before do baling on a function. +bool GenXBaling::prologue(Function *F) { + bool Changed = false; + auto nextInst = [](BasicBlock &BB, Instruction *I) -> Instruction * { + // This looks like an llvm bug. We cannot call getPrevNode + // on the first instruction... + if (isa(I) || I == &BB.front()) + return nullptr; + return I->getPrevNode(); + }; + + for (auto &BB : F->getBasicBlockList()) { + // scan the block backwards. + for (auto Inst = &BB.back(); Inst; Inst = nextInst(BB, Inst)) { + // + // Rewrite + // A = B op C + // V = wrr(A, R) + // E = A op D + // into + // + // A = B op C + // V = wrr(A, R) + // A' = rrd(V, R) + // E = A' op D + // + if (GenXIntrinsic::isWrRegion(Inst)) { + Instruction *V = dyn_cast( + Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + + // Only process the case with multiple uses. + if (!V || V->hasOneUse()) + continue; + + // Skip if this region write is indirect as + // this would result an indirect read. + Region R(Inst, BaleInfo()); + if (R.Indirect) + continue; + + // Aggressively apply this transform may increase register pressure. + // We detect if there is other region write in between, so that two + // outer regions will not be live at the same time. + if (skipTransform(V, Inst)) + continue; + + // Do this transformation. + // - Insert a region read right after Inst + // - Replace all uses other than Inst with this region read + // + auto NewV = R.createRdRegion(Inst, "split", Inst, Inst->getDebugLoc(), + /*AllowScalar*/ !V->getType()->isVectorTy()); + + assert(NewV->getType() == V->getType()); + Inst->moveBefore(NewV); + for (auto UI = V->use_begin(); UI != V->use_end(); /*Empty*/) { + Use &U = *UI++; + if (U.getUser() != Inst) + U.set(NewV); + } + Changed = true; + } + } + } + + // fold bitcast into store/load if any. This allows to bale a g_store instruction + // crossing a bitcast. 
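+  // For illustration only, the loop below performs rewrites such as:
+  //   %b = bitcast <8 x i32> %a to <16 x i16>
+  //   %c = bitcast <16 x i16> %b to <32 x i8>   ==>  %c = bitcast %a to <32 x i8>
+  // and deletes a store to a global of a value just loaded from the same
+  // global, e.g. "%v = load @G ... store %v, @G".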
+  for (auto &BB : F->getBasicBlockList()) {
+    for (auto I = BB.begin(); I != BB.end(); /*empty*/) {
+      Instruction *Inst = &*I++;
+      using namespace llvm::PatternMatch;
+
+      // bitcast (bitcast X to Ty1) to Ty2 ==> bitcast X to Ty2
+      Value *X;
+      if (match(Inst, m_BitCast(m_BitCast(m_Value(X))))) {
+        BitCastInst *NewCI = new BitCastInst(X, Inst->getType(), "", Inst);
+        NewCI->setDebugLoc(Inst->getDebugLoc());
+        NewCI->takeName(Inst);
+        Inst->replaceAllUsesWith(NewCI);
+        if (Liveness)
+          Liveness->eraseLiveRange(Inst);
+        Inst->eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
+      if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) {
+        Instruction *NewInst = foldBitCastInst(Inst);
+        if (NewInst) {
+          Changed = true;
+          Inst = NewInst;
+        }
+      }
+
+      // Delete trivially dead store instructions: a store to a global of a
+      // value just loaded from the same global.
+      if (auto ST = dyn_cast<StoreInst>(Inst)) {
+        Value *Val = ST->getValueOperand();
+        assert(Val);
+        if (auto LI = dyn_cast<LoadInst>(Val)) {
+          Value *Ptr = ST->getPointerOperand();
+          auto GV1 = getUnderlyingGlobalVariable(Ptr);
+          auto GV2 = getUnderlyingGlobalVariable(LI->getPointerOperand());
+          if (GV1 && GV1 == GV2) {
+            ST->eraseFromParent();
+            Changed = true;
+          }
+        }
+      }
+    }
+    for (auto I = BB.rbegin(); I != BB.rend(); /*empty*/) {
+      Instruction *Inst = &*I++;
+      if (isInstructionTriviallyDead(Inst)) {
+        if (Liveness)
+          Liveness->eraseLiveRange(Inst);
+        Inst->eraseFromParent();
+      }
+    }
+  }
+
+  // The result of an inline asm call with multiple outputs is a struct. Each
+  // element of that struct is passed to its user through an extractvalue
+  // instruction, which should be baled into its own wrregion. A
+  // genx_convert_addr intrinsic or a global load would otherwise be left
+  // unbaled between these (extractvalue + wrregion) bales. The idea is to move
+  // all such address conversions before the inline assembly instruction, and
+  // to move each extractvalue and the wrregion that uses its result close to
+  // the inline assembly call; otherwise baling would force the baled
+  // instructions to sit far away from the inline asm call, which increases
+  // live ranges.
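+  // Schematically (illustrative only), for a two-output inline asm:
+  //   %asm = call { <8 x i32>, <8 x i32> } asm "...", "..."(...)
+  //   ...                                    ; convert.addr, global loads
+  //   %r0  = extractvalue %asm, 0
+  //   %w0  = wrregion(%gload, %r0, %addr)
+  // the genx.convert.addr feeding %addr and the global load feeding the
+  // wrregion's old value are moved above the asm call, and %r0 / %w0 are
+  // moved right next to it.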
+ for (auto &BB : F->getBasicBlockList()) { + for (auto &Inst : BB.getInstList()) { + auto CI = dyn_cast(&Inst); + if (!CI || !CI->isInlineAsm()) + continue; + // Nothing to do if result is not a struct: no multiple outputs + if (!isa(CI->getType())) + continue; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (!EV) + continue; + EV->moveAfter(&Inst); + // extractelement must be baled into wrregion + for (auto User : EV->users()) { + Changed = true; + if (!GenXIntrinsic::isWrRegion(User)) + continue; + Instruction *WrR = cast(User); + Value *Index = + WrR->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + Instruction *IndexInst = dyn_cast(Index); + if (IndexInst && (GenXIntrinsic::getGenXIntrinsicID(IndexInst) == + GenXIntrinsic::genx_convert_addr)) + IndexInst->moveBefore(&Inst); + Value *OldVal = + WrR->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + LoadInst *Load = dyn_cast(OldVal); + if (Load && isGlobalLoad(Load)) + Load->moveBefore(&Inst); + WrR->moveAfter(EV); + } + } + } + } + + normalizeGStores(*F); + + // Remove Phi node with single incoming value + for (auto &BB : F->getBasicBlockList()) { + for (BasicBlock::iterator bi = BB.begin(), be = BB.end(); bi != be; ) { + Instruction *Inst = &*bi; + ++bi; + if (auto Phi = dyn_cast(Inst)) { + if (Phi->getNumIncomingValues() == 1) { + Phi->replaceAllUsesWith(Phi->getIncomingValue(0)); + Phi->eraseFromParent(); + Changed = true; + } + } + else { + break; + } + } + } + + return Changed; +} + + +/*********************************************************************** + * Bale::getMainInst : get the main instruction from the bale, 0 if none + */ +const BaleInst *Bale::getMainInst() const +{ + // From the last instruction (the bale head) backwards, find the first + // one that is not wrregion or saturate or addradd. If the head is + // wrregion, then skip anything before we reach its value operand. + // If the first one we find is rdregion, that does not count as a main + // instruction. + Value *PossibleMainInst = nullptr; + for (const_reverse_iterator i = rbegin(), e = rend(); i != e; ++i) { + if (PossibleMainInst && PossibleMainInst != i->Inst) + continue; + PossibleMainInst = nullptr; + switch (i->Info.Type) { + case BaleInfo::WRREGION: + PossibleMainInst = i->Inst->getOperand(1); + break; + case BaleInfo::GSTORE: + PossibleMainInst = i->Inst->getOperand(0); + break; + case BaleInfo::SATURATE: + case BaleInfo::ADDRADD: + break; + case BaleInfo::MAININST: + return &*i; + default: + return nullptr; + } + } + return nullptr; +} + +/*********************************************************************** + * eraseFromParent : do eraseFromParent on all instructions in the bale + */ +void Bale::eraseFromParent() +{ + // Iterate in reverse as each instruction becomes unused only when its + // user in the bale is erased. 
+ for (reverse_iterator ri = rbegin(), re = rend(); ri != re; ++ri) + ri->Inst->eraseFromParent(); +} + +/*********************************************************************** + * Bale::compare : compare this Bale with another one + * + * Return: 0 if equivalent + * < 0 if less + * > 0 if more + * + * Two Bales are equivalent if they compute the same value, that is, they + * have the same opcodes in the instructions, the instructions are + * baled together in the same way, and the operands coming in from outside + * the bale are the same. + * + * Both bales must have had hash() called on them since being built or + * modified in any other way. + */ +int Bale::compare(const Bale &Other) const +{ + assert(Hash && Other.Hash); + if (Hash != Other.Hash) + return Hash < Other.Hash ? -1 : 1; + if (size() != Other.size()) + return size() < Other.size() ? -1 : 1; + for (unsigned i = 0, e = size(); i != e; ++i) { + if (Insts[i].Info.Bits != Other.Insts[i].Info.Bits) + return Insts[i].Info.Bits < Other.Insts[i].Info.Bits ? -1 : 1; + Instruction *Inst = Insts[i].Inst, *OtherInst = Other.Insts[i].Inst; + if (Inst->getOpcode() != OtherInst->getOpcode()) + return Inst->getOpcode() < OtherInst->getOpcode() ? -1 : 1; + unsigned NumOperands = Inst->getNumOperands(); + if (NumOperands != OtherInst->getNumOperands()) + return NumOperands < OtherInst->getNumOperands() ? -1 : 1; + for (unsigned j = 0; j != NumOperands; ++j) { + Value *Opnd = Inst->getOperand(j); + if (!Insts[i].Info.isOperandBaled(j)) { + if (Opnd != OtherInst->getOperand(j)) + return Opnd < OtherInst->getOperand(j) ? -1 : 1; + } else { + // Baled operand. Find which baled instruction it is, and check that + // the other bale has its corresponding instruction used in its + // corresponding operand. + // (We could use a map to find the baled instruction + // in an algorithmically less complex way, but there is not likely + // to be more than 3 or 4 instructions in the bale so I didn't + // bother.) + unsigned BaledInst; + for (BaledInst = 0; Insts[BaledInst].Inst != Opnd; ++BaledInst) { + assert(BaledInst != size()); + } + if (Other.Insts[BaledInst].Inst != OtherInst->getOperand(j)) + return Other.Insts[BaledInst].Inst < OtherInst->getOperand(j) ? -1 : 1; + } + } + } + return 0; +} + +/*********************************************************************** + * hash_value : get a hash_code for a Bale + * + * If two Bales are equivalent, they have the same hash_value. + * + * If two Bales are not equivalent, it is unlikely but possible that + * they have the same hash_value. + */ +void Bale::hash() +{ + Hash = 0; + for (auto i = begin(), e = end(); i != e; ++i) { + BaleInst BI = *i; + Hash = hash_combine(Hash, BI.Info.Bits); + Hash = hash_combine(Hash, BI.Inst->getOpcode()); + for (unsigned j = 0, je = BI.Inst->getNumOperands(); j != je; ++j) { + Value *Opnd = BI.Inst->getOperand(j); + if (!BI.Info.isOperandBaled(j)) { + // Non-baled operand. Hash the operand itself. + Hash = hash_combine(Hash, Opnd); + } else { + // Baled operand. Find which baled instruction it is, and use that + // index in the hash. (We could use a map to find the baled instruction + // in an algorithmically less complex way, but there is not likely + // to be more than 3 or 4 instructions in the bale so I didn't + // bother.) 
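+        // Using the baled instruction's position within the bale, rather than
+        // its Instruction pointer, means that two equivalent bales built from
+        // different instructions still get the same hash, matching how
+        // compare() checks baled operands.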
+ Bale::iterator BaledInst; + for (BaledInst = begin(); BaledInst->Inst != Opnd; ++BaledInst) { + assert(BaledInst != i); + } + Hash = hash_combine(Hash, BaledInst - begin()); + } + } + } +} + +bool Bale::isGStoreBaleLegal() const { + assert(isGstoreBale()); + auto ST = cast(getHead()->Inst); + if (!isGlobalStore(ST)) + return false; + return isGlobalStoreLegal(ST); +} + +/*********************************************************************** + * Bale debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void Bale::dump() const +{ + print(errs()); +} +#endif + +void Bale::print(raw_ostream &OS) const +{ + OS << "bale {\n"; + for (const_iterator i = begin(), e = end(); i != e; ++i) { + i->Inst->print(OS); + OS << " // {" << i->Info.getTypeString() << "}\n"; + } + OS << "}\n"; +} + +const char *BaleInfo::getTypeString() const +{ + switch (Type) { + case BaleInfo::MAININST: return "maininst"; + case BaleInfo::WRREGION: return "wrregion"; + case BaleInfo::SATURATE: return "saturate"; + case BaleInfo::NOTMOD: return "notmod"; + case BaleInfo::NEGMOD: return "negmod"; + case BaleInfo::ABSMOD: return "absmod"; + case BaleInfo::RDREGION: return "rdregion"; + case BaleInfo::ADDRADD: return "addradd"; + case BaleInfo::RDPREDREGION: return "rdpredregion"; + case BaleInfo::ALLANY: return "allany"; + case BaleInfo::NOTP: return "notp"; + case BaleInfo::ZEXT: return "zext"; + case BaleInfo::SEXT: return "sext"; + case BaleInfo::WRPREDREGION: return "wrpreregion"; + case BaleInfo::CMPDST: return "cmpdst"; + case BaleInfo::GSTORE: return "g_store"; + case BaleInfo::SHUFFLEPRED: return "shufflepred"; + case BaleInfo::FADDR: return "faddr"; + default: return "???"; + } +} + +bool genx::operator==(const BaleInfo &lhs, const BaleInfo &rhs) { + return lhs.Type == rhs.Type && lhs.Bits == rhs.Bits; +} + +bool genx::operator==(const BaleInst &lhs, const BaleInst &rhs) { + return lhs.Inst == rhs.Inst && lhs.Info == rhs.Info; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h new file mode 100644 index 000000000000..d94bf49653ea --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h @@ -0,0 +1,550 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXBaling +/// ---------- +/// +/// GenX instruction baling is the process of determining which LLVM instructions +/// can be combined into a single vISA instruction. Such a group of LLVM +/// instructions is known as a *bale*. A bale typically has a *main instruction* +/// and then optionally modifiers and region instructions on the sources and +/// the destination of the main instruction. However it is possible to have a +/// bale with no main instruction, for example just a rdregion, a modifier and +/// a wrregion. +/// +/// Bale example +/// ^^^^^^^^^^^^ +/// +/// .. image:: GenXDesign_bale.png +/// +/// This example shows a bale that is pretty much as complicated as you can get in +/// a single bale. Each small blue box is an LLVM IR instruction, with arrows showing +/// how each one is used. Other than the *bale head* instruction at the top, an +/// instruction in a bale has only one use, which is within the bale. +/// +/// The baling pass +/// ^^^^^^^^^^^^^^^ +/// +/// GenX instruction baling happens in two parts: +/// +/// 1. The GenXBaling pass sets up a map to give each Instruction +/// a *BaleInfo*, which contains a field giving the role the instruction +/// plays in its enclosing bale (main instruction, rdregion, etc), and a +/// bit vector where a bit is set if the corresponding operand of the +/// instruction is another instruction that is baled in (part of the same +/// bale). +/// +/// GenXBaling is in fact two slightly different passes run at two different +/// times: +/// +/// * The GenXFuncBaling pass (a FunctionPass) runs before GenXLegalization, +/// which uses it but invalidates it as it changes the code. This is known +/// as *first baling*. +/// +/// * The GenXGroupBaling pass (a FunctionGroupPass) runs after GenXLiveness. +/// From GenXLiveness, baling information remains valid through to +/// GenXCisaBuilder, since any code changes made (such as adding +/// copies where coalescing fails) either do not invalidate the analysis, +/// or the pass making the change also updates the baling analysis. +/// +/// The GenXBaling pass also detects where an instruction is baled in to +/// another, but the instruction has other uses too. In this case it clones the +/// instruction. Thus we end up with any baled in instruction having only +/// one use (with an exception for goto/join -- see below). +/// +/// Thus the GenXBaling pass is not a pure analysis, as it can modify the +/// code. +/// +/// 2. Using the map set up by the GenXBaling analysis, several functions are +/// provided for use by other passes: +/// +/// * getBaleInfo()/setBaleInfo() allow another pass to directly inspect and modify +/// the baling info for an instruction. The BaleInfo for an instruction gives: +/// +/// - Type, the role of the instruction in the bale (e.g. it is a rdregion); +/// - a bitmap of which operands are baled into it, together with methods +/// for getting and setting the bit for a particular operand. +/// +/// * getBaleParent() returns the instruction that the given instruction is +/// baled into, if any +/// +/// * isBaled() says whether the given instruction is baled into anything +/// +/// * getBaleHead() returns the instruction at the head of the bale that the +/// given instruction is baled into, which is the same as the given instruction +/// if it is not baled into anything. 
+///
+/// * buildBale() takes a head instruction (one for which isBaled is false) and
+///   fills out a Bale struct with a vector of BaleInst structs for all the
+///   instructions in the bale, where each BaleInst contains a pointer to the
+///   instruction and its BaleInfo struct (as in getBaleInfo()/setBaleInfo()).
+///
+/// Criteria for baling
+/// ^^^^^^^^^^^^^^^^^^^
+///
+/// GenXBaling implements the criteria for baling, i.e. when different LLVM IR
+/// instructions can be combined into the same vISA instruction:
+///
+/// * A rdregion with a variable index can bale in an add constant (where the
+///   constant is splatted if vector) that generates the index. In second baling,
+///   the constant add is in fact a ``llvm.genx.add.addr`` intrinsic, because that
+///   is what GenXCategory converted it to.
+///
+/// * GenXBaling is where an instruction gets recognized as a modifier, for example
+///   subtract from 0 is a negate modifier. The instruction is left as it is, and
+///   its modifier equivalent (e.g. ``BaleInfo::NEGMOD``) is set up in the
+///   instruction's BaleInfo.
+///
+/// * SExt/ZExt are also treated as modifiers, although not always balable. See
+///   below.
+///
+/// * A modifier can bale in an rdregion.
+///
+/// * A modifier can bale in another modifier in some circumstances.
+///
+/// * In particular, SExt/ZExt normally cannot bale in another modifier, but they
+///   are allowed to bale in an abs modifier as a bodge to fix a problem where
+///   the LLVM IR generated for ``cm_abs`` does not properly represent its
+///   semantics. See ``dc93b907 GenXBaling: bodge to work around cm_abs problems``.
+///
+/// * A main instruction can bale a modifier or rdregion into each operand in some
+///   circumstances:
+///
+///   - Some ALU intrinsics have region requirements, e.g. oword aligned,
+///     contiguous. GenXBaling enforces those requirements by only baling in an
+///     rdregion that satisfies them, but only in second baling. First baling does
+///     the baling anyway, as we want GenXLegalization to consider the instructions
+///     as one bale as it might legalize in a way that makes the region legal for
+///     the instruction.
+///
+///   - Baling an SExt/ZExt in is how we represent a vISA instruction such as
+///     ``add`` with a result type different to operand type. The two operands can
+///     have different types too in Gen, but vISA insists they are the same (if not
+///     constant). So:
+///
+///     1. In first baling, we allow SExt/ZExt from different types to be baled in
+///        to the two operands. This tends to make GenXLegalization legalize them
+///        to the same vector width as the main instruction.
+///
+///     2. In second baling, we do not allow SExt/ZExt from different types (or one
+///        SExt/ZExt where the other operand does not have one) to be baled in. This
+///        yields a legal vISA instruction, but having done (1) also allows the
+///        finalizer to fold the extend into the instruction.
+///
+///   - A raw operand (of a send or shared function intrinsic) has its own
+///     restrictions -- it can bale in a rdregion, but the region has to be
+///     contiguous and GRF aligned.
+///
+///   - There is special case code for where send or a shared function intrinsic
+///     has a ``TWOADDR`` raw operand, one that does not appear as a vISA operand
+///     in its own right but is implicitly the same register as the result.
The +/// twoaddr raw operand can bale in a rdregion (with region contiguous and GRF +/// aligned) as long as the result can be baled into a wrregion with the same +/// region parameters and the same "old value" input. This represents where a +/// send or shared function intrinsic does a predicated partial write, and the +/// place it does the partial write to is a region in a vISA register. +/// +/// * ``llvm.genx.sat`` represents floating point saturation, and is a modifier that +/// is different to the other modifiers because it is not a source modifier. A +/// saturate can bale in a main instruction or modifier or rdregion. +/// +/// * A wrregion can do the following baling: +/// +/// - It can bale a main instruction (subject to region restrictions in second +/// baling), a saturate, a modifier or a rdregion into its "new value" input. +/// +/// - Like rdregion, it can bale a constant add into its index operand. +/// +/// * Anything with a predicate (wrregion, select, send, all/any, some shared +/// function intrinsics) can bale in a predicate not, and any of those things, +/// including the not, can bale in an rdpredregion to represent using e.g. an M3 +/// flag to use only part of the predicate. However predicate baling is not +/// done in first baling, as GenXLegalization does not want to consider the +/// operations together. +/// +/// * Anything with a scalar i1 condition (select, br) can bale in an all/any. +/// +/// Baling of goto/join into br +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The goto and join intrinsics have multiple return values, returned in a single +/// struct. One of the return values is the scalar i1 !any value that is then used +/// in a conditional branch. +/// +/// In second baling, we want the goto/join, the extractvalue of the !any +/// result, and the conditional branch to be baled together, so we can generate +/// a single goto/join instruction. +/// +/// However the struct result of the goto/join has other uses, the extractvalues of +/// the other results. Thus, in this special case, we have a bale where the +/// goto/join instruction inside the bale has uses other than the inside-bale use. +/// This needs special case code to handle in GenXBaling. +/// +/// In the future it may be worth considering a generalization of this idea of a +/// bale that is not a strict tree of instructions, so that we can use LLVM IR +/// to model Gen instructions with a general result and a flag result. Currently +/// we cannot do that, which means: +/// +/// 1. we cannot represent addc properly; +/// +/// 2. we cannot represent any combined arithmetic-and-set-flags instruction, +/// although that is not too much of a problem as the jitter derives such an +/// instruction by folding a cmp into an arithmetic instruction. +/// +/// Alignment requirements +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An additional function of the second baling pass is that, when it bales a +/// raw result intrinsic into a wrregion, it marks the wrregion's LiveRange as +/// needing to be 32 aligned, and when it bales a rdregion into a raw operand in +/// an intrinsic, it marks the rdregion's input's LiveRange as needing to be 32 +/// aligned. GenXCategory sets most alignment requirements, but baling in +/// a rdregion or baling a main instruction into a wrregion imposes alignment +/// requirements on the vISA register that the region is read from or written to. 
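+///
+/// Illustrative bale (schematic)
+/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+///
+/// As a rough sketch (not verbatim IR from any test, with region arguments and
+/// exact intrinsic suffixes elided), a bale made of a rdregion, a sign-extend
+/// modifier, a main instruction and a wrregion head looks like::
+///
+///   %rd  = call <8 x i16> @llvm.genx.rdregioni...(<64 x i16> %src, ...)  ; RDREGION
+///   %ext = sext <8 x i16> %rd to <8 x i32>                               ; SEXT modifier
+///   %mul = mul <8 x i32> %ext, %other                                    ; MAININST
+///   %wr  = call <64 x i32> @llvm.genx.wrregioni...(<64 x i32> %old,
+///                                                  <8 x i32> %mul, ...)  ; WRREGION, bale head
+///
+/// Only the bale head %wr may have multiple uses; %rd, %ext and %mul each have
+/// their single use inside the bale.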
+/// +//===----------------------------------------------------------------------===// +#ifndef GENXBALING_H +#define GENXBALING_H + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXRegion.h" +#include "GenXAlignmentInfo.h" +#include "GenXSubtarget.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/Pass.h" +#include + +namespace llvm { + class BranchInst; + class CallInst; + class DebugLoc; + class GenXLiveness; + class Instruction; + class raw_ostream; + class Twine; + class Value; + +namespace genx { + +// BaleInfo : bale info for one instruction +struct BaleInfo { + // Type is how this instruction relates to its bale, whether it is a + // rdregion, wrregion, modifier, or main instruction. + enum { MAININST, WRREGION, SATURATE, NOTMOD, NEGMOD, ABSMOD, + RDREGION, ADDRADD, ADDROR, FADDR, RDPREDREGION, ALLANY, NOTP, ZEXT, SEXT, + SHUFFLEPRED, WRPREDREGION, WRPREDPREDREGION, CMPDST, GSTORE }; + uint16_t Type; + uint16_t Bits; // bitmap of which operands are baled in + BaleInfo(int Type = MAININST, unsigned Bits = 0) : Type(Type), Bits(Bits) {} + // isOperandBaled() : read Bits to see if operand is baled + bool isOperandBaled(unsigned OperandNum) const { return (Bits >> OperandNum) & 1; } + // clearOperandBaled() : clear bit that says that operand is baled + void clearOperandBaled(unsigned OperandNum) { Bits &= ~(1 << OperandNum); } + // setOperandBaled() : set bit that says that operand is baled + void setOperandBaled(unsigned OperandNum) { Bits |= 1 << OperandNum; } + // getTypeString : get string for BaleInfo type + const char *getTypeString() const; +}; + +bool operator==(const BaleInfo &lhs, const BaleInfo &rhs); + +// BaleInst : one instruction in a bale +struct BaleInst { + Instruction *Inst; + BaleInfo Info; + BaleInst(Instruction *Inst, BaleInfo Info) : Inst(Inst), Info(Info) {} +}; + +bool operator==(const BaleInst &lhs, const BaleInst &rhs); + +// Bale : all the instructions in a bale, filled out by buildBale() +class Bale { + typedef SmallVector Insts_t; + Insts_t Insts; + hash_code Hash; +public: + Bale() : Hash(0) {} + void clear() { Insts.clear(); Hash = 0; } + // push_front : push an instruction onto the "front", i.e. it is baled + // into an instruction already in the bale + void push_front(BaleInst BI) { Insts.push_back(BI); } + BaleInst &front() { return Insts.back(); } + // push_back : push an instruction onto the "back", i.e. it is the new + // head instruction, and the old head instruction is baled into it. + // This does an inefficient insert, but is only used in legalization + // when adding a wrregion to a bale that does not already have one. + void push_back(BaleInst BI) { Insts.insert(Insts.begin(), BI); } + BaleInst &back() { return Insts.front(); } + // Forward iterator: gives an instruction before any use of it, with the + // head instruction of the bale coming last. + typedef Insts_t::reverse_iterator iterator; + typedef Insts_t::const_reverse_iterator const_iterator; + iterator begin() { return Insts.rbegin(); } + iterator end() { return Insts.rend(); } + const_iterator begin() const { return Insts.rbegin(); } + const_iterator end() const { return Insts.rend(); } + unsigned size() const { return Insts.size(); } + bool empty() const { return Insts.empty(); } + // getIteratorPos : get 0..31 unsigned representing position of + // Bale::iterator. 
+ unsigned getIteratorPos(iterator i) { + assert((unsigned)(i - Insts.rbegin()) < 32); + return i - Insts.rbegin(); + } + // Reverse iterator: gives an instruction after any use of it, with the + // head instruction of the bale coming first. + typedef Insts_t::iterator reverse_iterator; + typedef Insts_t::const_iterator const_reverse_iterator; + reverse_iterator rbegin() { return Insts.begin(); } + const_reverse_iterator rbegin() const { return Insts.begin(); } + reverse_iterator rend() { return Insts.end(); } + const_reverse_iterator rend() const { return Insts.end(); } + // getHead : get head instruction of the bale + iterator getHeadIt() { return std::prev(end()); } + const_iterator getHeadIt() const { return std::prev(end()); } + BaleInst *getHead() { return &*getHeadIt(); } + const BaleInst *getHead() const { return &*getHeadIt(); } + // getPreHead : returns instruction prior to head instruction + // unsafe: if there's no such instruction, behavior is undefined + iterator getPreHeadIt() { return std::prev(getHeadIt()); } + BaleInst *getPreHead() { return &*getPreHeadIt(); } + // If a bale ends with a g_store bale, return the baled instruction prior to + // this g_store instruction. + iterator getHeadIgnoreGStoreIt() { + if (endsWithGStore()) + return getPreHeadIt(); + return getHeadIt(); + } + BaleInst *getHeadIgnoreGStore() { + return &*getHeadIgnoreGStoreIt(); + } + bool endsWithGStore() const { + return !empty() && rbegin()->Info.Type == BaleInfo::GSTORE; + } + // getMainInst : get 0 else the main inst from the bale + const BaleInst *getMainInst() const; + // hash : set hash code for bale. Must be called before using comparison + // operators. + void hash(); + // Comparison operators. Two bales are equivalent if they compute the same + // value, that is, they have the same opcodes in the instructions, the + // instructions are baled together in the same way, and the operands coming + // in from outside the bale are the same. 
+ bool operator==(const Bale &Other) const { return !compare(Other); } + bool operator!=(const Bale &Other) const { return compare(Other); } + bool operator<(const Bale &Other) const { return compare(Other) < 0; } + int compare(const Bale &Other) const; + // eraseFromParent : do eraseFromParent on all instructions in the bale + void eraseFromParent(); + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + bool isGstoreBale() const { return endsWithGStore(); } + bool isGStoreBaleLegal() const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const Bale &B) { + B.print(OS); + return OS; +} + +} // end namespace genx + +//---------------------------------------------------------------------- +// GenXBaling : the baling information for a Function or FunctionGroup (depending +// on whether GenXFuncBaling or GenXGroupBaling created it) +class GenXBaling { + BalingKind Kind; + typedef llvm::ValueMap> InstMap_t; + GenXSubtarget *ST; + InstMap_t InstMap; + struct NeedClone { + Instruction *Inst; + unsigned OperandNum; + NeedClone(Instruction *Inst = 0, unsigned OperandNum = 0) + : Inst(Inst), OperandNum(OperandNum) {} + bool operator==(const NeedClone &Other) const { + return Inst == Other.Inst && OperandNum == Other.OperandNum; + } + }; + typedef SmallVector NeedCloneStack_t; + NeedCloneStack_t NeedCloneStack; + SmallVector TwoAddrSends; +protected: + GenXLiveness *Liveness; // only in group baling +public: + genx::AlignmentInfo AlignInfo; +public: + explicit GenXBaling(BalingKind BKind, GenXSubtarget *Subtarget) + : Kind(BKind), ST(Subtarget), + Liveness(nullptr) {} + // clear : clear out the analysis + void clear() { InstMap.clear(); } + // processFunctionGroup : process all the Functions in a FunctionGroup + bool processFunctionGroup(FunctionGroup *FG); + // processFunction : process one Function + bool processFunction(Function *F); + // processInst : recalculate the baling info for an instruction + void processInst(Instruction *Inst); + // getBaleInfo : get BaleInfo for an instruction + genx::BaleInfo getBaleInfo(const Instruction *Inst) const { + InstMap_t::const_iterator i = InstMap.find((const llvm::Value *)Inst); + return i == InstMap.end() ? genx::BaleInfo() : i->second; + } + // setBaleInfo : set BaleInfo for an instruction + void setBaleInfo(const Instruction *Inst, genx::BaleInfo BI); + // isBaled : test whether all uses of an instruction would be baled in to + // users + bool isBaled(Instruction *Inst) { return getBaleParent(Inst); } + // getBaleParent : return the instruction baled into, 0 if none + Instruction *getBaleParent(Instruction *Inst); + // unbale : unbale an instruction from its bale parent + void unbale(Instruction *Inst); + // getBaleHead : return the head of the bale containing this instruction + Instruction *getBaleHead(Instruction *Inst); + // buildBale : build Bale from head instruction. 
B assumed empty on entry + void buildBale(Instruction *Inst, genx::Bale *B, bool IncludeAddr = false) const; + // store : store updated BaleInfo for Instruction (used to unbale by + // GenXLegalization) + void store(genx::BaleInst BI); + // getIndexAdd : test whether the specified value is a constant add/sub that + // could be baled in as a variable index offset, but without checking that + // the index is in range + static bool getIndexAdd(Value *V, int *Offset); + // getIndexOr : test whether the specified value is a constant or that + // could be baled in as a variable index offset, but without checking that + // the index is in range + static bool getIndexOr(Value *V, int &Offset); + // isBalableIndexAdd : test whether the specified value is a constant + // add/sub that could be baled in as a variable index offset + static bool isBalableIndexAdd(Value *V); + // isBalableIndexOr : test whether the specified value is a constant + // or that could be baled in as a variable index offset + static bool isBalableIndexOr(Value *V); + // isBalableNewValueIntoWrr: check whether the new val operand can + // be baled into wrr instruction + static bool + isBalableNewValueIntoWrr(Value *V, const genx::Region &WrrR, + const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo = nullptr, + BalingKind BKind = BalingKind::BK_Legalization); + + static bool isHighCostBaling(uint16_t Type, Instruction *Inst); + // Debug dump/print + void dump(); + void print(raw_ostream &OS); +private: + // methods to build the info when running the analysis + void processWrPredRegion(Instruction *Inst); + void processWrPredPredRegion(Instruction *Inst); + void processWrRegion(Instruction *Inst); + bool processSelect(Instruction *Inst); + void processStore(StoreInst *Inst); + bool processShufflePred(Instruction *Inst); + bool processPredicate(Instruction *Inst, unsigned OperandNum); + void processSat(Instruction *Inst); + void processRdRegion(Instruction *Inst); + void processInlineAsm(Instruction *Inst); + void processExtractValue(ExtractValueInst *EV); + void processFuncPointer(PtrToIntInst *Inst); + void processMainInst(Instruction *Inst, int IntrinID); + // helper func for buildBale + void buildBaleSub(Instruction *Inst, genx::Bale *B, bool IncludeAddr) const; + void processBranch(BranchInst *Branch); + void processTwoAddrSend(CallInst *CI); + void setOperandBaled(Instruction *Inst, unsigned OperandNum, genx::BaleInfo *BI); + void doClones(); + Instruction *getOrUnbaleExtend(Instruction *Inst, genx::BaleInfo *BI, + unsigned OperandNum, bool Unbale); + int getAddrOperandNum(unsigned IID) const; + + bool operandIsBaled(Instruction *Inst, + unsigned OperandNum, int ModType, + unsigned ArgInfoBits); + + static bool + isRegionOKForIntrinsic(unsigned ArgInfoBits, const genx::Region &R, + bool CanSplitBale, const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo = nullptr, + BalingKind BKind = BalingKind::BK_Legalization); + + // Cleanup and optimization before do baling on a function. 
+ bool prologue(llvm::Function *F); +}; + +//---------------------------------------------------------------------- +// The GenXFuncBaling analysis pass +// (used for the first baling just before GenXLegalization) +class GenXFuncBaling : public FunctionPass, public GenXBaling { +public: + static char ID; + explicit GenXFuncBaling(BalingKind Kind = BalingKind::BK_Legalization, GenXSubtarget *ST = nullptr) + : FunctionPass(ID), GenXBaling(Kind, ST) {} + virtual StringRef getPassName() const { + return "GenX instruction baling analysis for a function"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F) { + clear(); + return processFunction(&F); + } + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } +}; +void initializeGenXFuncBalingPass(PassRegistry &); + +//---------------------------------------------------------------------- +// The GenXGroupBaling analysis pass +// (used for the second baling just before GenXLiveRanges) +class GenXGroupBaling : public FunctionGroupPass, public GenXBaling { +public: + static char ID; + explicit GenXGroupBaling(BalingKind Kind = BalingKind::BK_Legalization, GenXSubtarget *ST = nullptr) + : FunctionGroupPass(ID), GenXBaling(Kind, ST) {} + virtual StringRef getPassName() const { + return "GenX instruction baling analysis for a function group"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXGroupPrinterPass(O, Banner); + } +}; +void initializeGenXGroupBalingPass(PassRegistry &); + +} // end namespace llvm + +#endif // GENXBALING_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp new file mode 100644 index 000000000000..e962db4dea9c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp @@ -0,0 +1,354 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXCFSimplification +/// -------------------- +/// +/// This is a function pass that simplifies CF as follows: +/// +/// * Where a conditional branch on "not any(pred)" branches over a single +/// basic block containing a small number of instructions, and all +/// instructions are either predicated by pred or are used only in the same +/// basic block, then change the branch to "branch never" so it gets +/// removed later. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CFSIMPLIFICATION" + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm-c/Core.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXCFSimplification : simplify SIMD CF code +class GenXCFSimplification : public FunctionPass { + static const unsigned Threshold; + bool Modified = false; + SmallVector BranchedOver; +public: + static char ID; + explicit GenXCFSimplification() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX SIMD CF simplification"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + bool isBranchedOverBlock(BasicBlock *BB); + BasicBlock *processBranchedOverBlock(BasicBlock *BB); + bool isPredSubsetOf(Value *Pred1, Value *Pred2, bool Inverted); +}; + +// Threshold for removing a simd cf branch. The 9999 setting means it is +// pretty much always removed when it can be. +const unsigned GenXCFSimplification::Threshold = 9999; + +} // end anonymous namespace + +char GenXCFSimplification::ID = 0; +namespace llvm { void initializeGenXCFSimplificationPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXCFSimplification, "GenXCFSimplification", "GenXCFSimplification", false, false) +INITIALIZE_PASS_END(GenXCFSimplification, "GenXCFSimplification", "GenXCFSimplification", false, false) + +FunctionPass *llvm::createGenXCFSimplificationPass() +{ + initializeGenXCFSimplificationPass(*PassRegistry::getPassRegistry()); + return new GenXCFSimplification(); +} + +void GenXCFSimplification::getAnalysisUsage(AnalysisUsage &AU) const +{ +} + +/*********************************************************************** + * GenXCFSimplification::runOnFunction : process one function to + * simplify SIMD CF + */ +bool GenXCFSimplification::runOnFunction(Function &F) +{ + LLVM_DEBUG(dbgs() << "GenXCFSimplification::runOnFunction(" << F.getName() << ")\n"); + Modified = false; + // Build a list of simple branched over basic blocks. + for (auto fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + auto BB = &*fi; + if (isBranchedOverBlock(BB)) { + LLVM_DEBUG(dbgs() << "is branched over: " << BB->getName() << "\n"); + BranchedOver.push_back(BB); + } + } + // Process each branched over block. + while (!BranchedOver.empty()) { + auto BB = BranchedOver.back(); + BranchedOver.pop_back(); + BasicBlock *SubsumedInto = processBranchedOverBlock(BB); + if (!SubsumedInto) + continue; + Modified = true; + // The joined together block may now be a simple branched over block. 
+ if (isBranchedOverBlock(SubsumedInto)) { + LLVM_DEBUG(dbgs() << "is branched over: " << SubsumedInto->getName() << "\n"); + BranchedOver.push_back(SubsumedInto); + } + } + return Modified; +} + + +/*********************************************************************** + * isBranchedOverBlock : detect whether a basic block is a simple branched + * over block. It must have a single predecessor and a single successor, + * and the predecessor must end in a conditional branch whose other + * successor is our successor. + */ +bool GenXCFSimplification::isBranchedOverBlock(BasicBlock *BB) +{ + if (BB->use_empty()) + return false; // no predecessors + if (!BB->hasOneUse()) + return false; // more than one predecessor + auto Term = BB->getTerminator(); + if (Term->getNumSuccessors() != 1) + return false; // not exactly one successor + Use *U = &*BB->use_begin(); + auto PredBr = dyn_cast(U->getUser()); + if (!PredBr || !PredBr->isConditional()) + return false; // predecessor is not conditional branch + auto Succ = Term->getSuccessor(0); + if (PredBr->getSuccessor(0) == BB) { + if (PredBr->getSuccessor(1) != Succ) + return false; // other cond branch successor is not our successor + } else { + if (PredBr->getSuccessor(0) != Succ) + return false; // other cond branch successor is not our successor + } + return true; +} + +/*********************************************************************** + * processBranchedOverBlock : process a branched over block + * + * Return: 0 if unchanged, else the basic block that BB has been subsumed into + */ +BasicBlock *GenXCFSimplification::processBranchedOverBlock(BasicBlock *BB) +{ + LLVM_DEBUG(dbgs() << "processBranchedOverBlock: " << BB->getName() << "\n"); + // Check that the condition to enter the branched over block is an any + // of a predicate. + auto PredBr = cast(BB->use_begin()->getUser()); + auto Cond = PredBr->getCondition(); + bool Inverted = false; + switch (GenXIntrinsic::getGenXIntrinsicID(Cond)) { + case GenXIntrinsic::genx_any: + if (PredBr->getSuccessor(0) != BB) + return nullptr; // branch is the wrong way round + break; + case GenXIntrinsic::genx_all: + if (PredBr->getSuccessor(1) != BB) + return nullptr; // branch is the wrong way round + Inverted = true; + break; + default: + return nullptr; // condition not "any" or "all" + } + Cond = cast(Cond)->getOperand(0); + LLVM_DEBUG(dbgs() << "branched over simd cf block: " << BB->getName() << " with Cond " << Cond->getName() + << (Inverted ? " (inverted)" : "") << "\n" + << "(source line of branch is " << PredBr->getDebugLoc().getLine() << "\n"); + // Check that each phi node in the successor has incomings related as + // follows: the incoming from BB must be a chain of selects or predicated + // wrregions where the ultimate original input is the other incoming, and + // each predicate must be Cond (inverted if necessary), or a subset of it. + // Also count the phi nodes that have different incomings for the two blocks, + // and if that goes over the threshold give up. 
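The check sketched in the comment above hinges on one predicate being a "subset" of another: every lane enabled by the inner predicate is also enabled by the branch condition. On plain bitmasks that notion looks as follows (a standalone model for illustration only; the real isPredSubsetOf later in this file works syntactically on the IR):

```cpp
#include <cassert>
#include <cstdint>

// Standalone model of "P1 is a subset of P2": every bit set in P1 is set in P2.
static bool isSubset(uint32_t P1, uint32_t P2) { return (P1 & ~P2) == 0; }

int main() {
  uint32_t Cond = 0b1011u;          // lanes enabled by the branch condition
  uint32_t Inner = Cond & 0b0110u;  // a predicate AND-ed with something else
  assert(isSubset(Inner, Cond));    // an AND with Cond is always a subset
  assert(isSubset(Cond ^ 0xFFFFFFFFu, ~Cond)); // XOR with all-ones == inverse
  return 0;
}
```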
+ unsigned Count = 0; + BasicBlock *Succ = BB->getTerminator()->getSuccessor(0); + BasicBlock *Pred = PredBr->getParent(); + for (auto Inst = &Succ->front(); ; Inst = Inst->getNextNode()) { + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + LLVM_DEBUG(dbgs() << "Phi " << *Phi << "\n"); + Value *V = Phi->getIncomingValueForBlock(BB); + Value *Orig = Phi->getIncomingValueForBlock(Pred); + LLVM_DEBUG(dbgs() << "V: " << *V << "\n" + << "Orig: " << *Orig << "\n"); + if (V == Orig) + continue; + // Check for special case that Orig is constant 0 and V is the condition + // input to any, thus we know that V is 0 if the branch over is taken. + // Thus we can change Pred's incoming to the phi node to match BB's. Not + // doing this can result in the branch over not being removable if it is an + // inner if..else..endif. + if (auto C = dyn_cast(Orig)) { + if (C->isNullValue() && V == Cond) { + Phi->setIncomingValue(Phi->getBasicBlockIndex(Pred), V); + continue; + } + } + // Normal check on for phi node. + bool OK = false; + for (;;) { + LLVM_DEBUG(dbgs() << " checking " << *V << "\n"); + if (V == Orig) { + OK = true; + break; + } + auto Inst = dyn_cast(V); + if (!Inst) + break; + if (++Count > Threshold) { + LLVM_DEBUG(dbgs() << "Over threshold\n"); + break; + } + if (isa(Inst)) { + if (!isPredSubsetOf(Inst->getOperand(0), Cond, Inverted)) + break; + V = Inst->getOperand(2); + continue; + } + if (!GenXIntrinsic::isWrRegion(Inst)) + break; + if (!isPredSubsetOf(Inst->getOperand( + GenXIntrinsic::GenXRegion::PredicateOperandNum), Cond, Inverted)) + break; + V = Inst->getOperand(0); + } + if (!OK) { + LLVM_DEBUG(dbgs() << "failed\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << "OK\n"); + } + // Check that the block does not contain any calls or intrinsics with + // side effects. + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) + if (auto CI = dyn_cast(&*bi)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + LLVM_DEBUG(dbgs() << "contains call\n"); + return nullptr; + } + if (!CI->getCalledFunction()->doesNotAccessMemory()) { + LLVM_DEBUG(dbgs() << "contains intrinsic with side effect\n"); + return nullptr; + } + } + // We can now do the transformation. + LLVM_DEBUG(dbgs() << "Transforming " << BB->getName() << "\n"); + // Move instructions from BB into the predecessor. + for (;;) { + auto Inst = &BB->front(); + if (Inst) { + if (Inst->isTerminator()) + break; + Inst->removeFromParent(); + Inst->insertBefore(PredBr); + } + } + // In each phi node in the successor, change the incoming for the predecessor + // to match the incoming for our BB, and remove the incoming for our BB. + // If that would leave only one incoming, then remove the phi node. + for (auto Inst = &Succ->front();; ) { + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + auto Next = Inst->getNextNode(); + if (Phi->getNumIncomingValues() == 2) { + Value *V = Phi->getIncomingValueForBlock(BB); + Phi->replaceAllUsesWith(V); + Phi->eraseFromParent(); + // Having got rid of the phi, it is worth running instruction + // simplification on each use. Specifically, this turns the + // P3 = (P1 & P2) | (P1 & ~P2) at the endif of an if that + // has an else into the simpler P1. Without that, an enclosing if + // would never have its branch removed, because the use of the "or" + // as a predicate stops us detecting that all predicates are a + // subset of the branch condition. + // Run instruction simplification on each use, but restart if any + // simplification happens as then the use chain changes under our feet. 
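The simplification the comment above relies on is a plain Boolean identity: for any lane masks, (P1 & P2) | (P1 & ~P2) collapses back to P1. A quick standalone check of that identity (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively verify (P1 & P2) | (P1 & ~P2) == P1 over 4-bit lane masks.
  for (uint32_t P1 = 0; P1 < 16; ++P1)
    for (uint32_t P2 = 0; P2 < 16; ++P2)
      assert(((P1 & P2) | (P1 & ~P2)) == P1);
  return 0;
}
```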
+ if (auto I = dyn_cast(V)) { + bool Restart = true; + while (Restart) { + Restart = false; + for (auto ui = I->use_begin(), ue = I->use_end(); ui != ue; ++ui) + if (recursivelySimplifyInstruction( + cast(ui->getUser()))) { + Restart = true; + break; + } + } + } + } else { + unsigned PredIdx = Phi->getBasicBlockIndex(Pred); + unsigned BBIdx = Phi->getBasicBlockIndex(BB); + Phi->setIncomingValue(PredIdx, Phi->getIncomingValue(BBIdx)); + Phi->removeIncomingValue(BBIdx); + } + Inst = Next; + } + // Change the predecessor to have an unconditional branch to the successor. + auto NewBr = BranchInst::Create(Succ, PredBr); + NewBr->takeName(PredBr); + auto CondInst = dyn_cast(PredBr->getCondition()); + PredBr->eraseFromParent(); + if (CondInst && CondInst->use_empty()) + CondInst->eraseFromParent(); + // Remove the now empty and unreferenced BB. + BB->eraseFromParent(); + // Merge Pred and Succ blocks. + MergeBlockIntoPredecessor(Succ); + return Pred; +} + +/*********************************************************************** + * isPredSubsetOf : detect whether Pred1 is a subset of Pred2 (or of ~Pred2 + * if Inverted is set) + */ +bool GenXCFSimplification::isPredSubsetOf(Value *Pred1, Value *Pred2, + bool Inverted) +{ + if (Pred1 == Pred2 && !Inverted) + return true; + auto BO = dyn_cast(Pred1); + if (!BO) + return false; + if (BO->getOpcode() == Instruction::And) + return isPredSubsetOf(BO->getOperand(0), Pred2, Inverted) + || isPredSubsetOf(BO->getOperand(1), Pred2, Inverted); + if (BO->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(BO->getOperand(1))) + return BO->getOperand(0) == Pred2 && C->isAllOnesValue(); + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp new file mode 100644 index 000000000000..2318de0c04a8 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp @@ -0,0 +1,1060 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXCategory +/// ------------ +/// +/// This pass performs five functions: +/// +/// 1. It splits any struct phi into a phi for each element of the struct. This +/// is done in GenXLowering, but a subsequent pass can re-insert a struct phi so +/// this pass mops those up. +/// +/// 2. It resolves each overlapping circular phi value. 
+/// +/// LLVM IR does not attach +/// any importance to the order of phi nodes in any particular basic block. +/// At the head of a loop, a phi incoming can also be a phi definition in the +/// same block, and they could be in either order. +/// +/// However, once we start constructing live ranges in the GenX backend, we +/// attach importance to the order of the phi nodes, so we need to resolve +/// any such overlapping circular phi value. Currently we do this by +/// inserting a copy (actually a bitcast) just after the phi nodes in that +/// basic block. A future enhancement would be to try and re-order the phi +/// nodes, and only fall back to copy insertion if there is circularity and +/// it is impossible to find a correct order, for example when the loop body +/// swaps two variables over. +/// +/// 3. It inserts a load for any operand that is constant but not allowed to be. +/// It also catches any case where constant propagation in EarlyCSE has +/// caused a non-simple constant to be propagated into the instruction. +/// See the GenXConstants section above. +// (in GenXConstants.cpp) +/// +/// 4. It determines the register category and increased alignment requirement +/// (e.g. use as a raw operand) of each value, and stores it by creating a +/// LiveRange for the value and storing it there. At this stage the LiveRange +/// does not contain any other information; GenXLiveRanges populates it further +/// (or erases it if the value turns out to be baled in). +/// +/// 5. It inserts instructions as required to convert from one register +/// category to another, where a value has its def and uses not all requiring +/// the same category. +/// +/// All this pass inserts is a llvm.genx.convert intrinsic. It does not record +/// what the categories are. This information is recalculated in GenXLiveness. +/// +/// The reason for inserting the convert intrinsic calls here, before the final +/// run of GenXBaling before GenXLiveRanges, is that we want GenXBaling to spot +/// when a convert intrinsic can be baled with rdregion or wrregion. +/// +/// For one value (function argument or instruction), the pass looks at the +/// categories required for the defintion and each use. If there is no address +/// conversion involved, then it inserts a single conversion if possible (all +/// uses are the same category), otherwise it inserts a conversion for each use +/// that requires one. +/// +/// **IR restriction**: After this pass, a value must have its def and all uses +/// requiring the same register category. +/// +/// Address conversion +/// ^^^^^^^^^^^^^^^^^^ +/// +/// An address conversion is treated slightly differently. +/// +/// A rdregion/wrregion representing an indirect region has a variable index. +/// This index is actually an index, whereas the vISA we need to generate for +/// it uses an address register that has been set up with an ``add_addr`` +/// instruction from the index and the base register. +/// +/// This pass inserts an ``llvm.genx.convert.addr`` intrinsic, with zero offset, +/// to represent the conversion from index to address register. However, the +/// intrinsic has no way of representing the base register. Instead, the base +/// register is implicitly the "old value" input of the rdregion/wrregion where +/// the address is used. +/// +/// The same index may well be used in multiple rdregions and wrregions, +/// especially after LLVM's CSE. 
But at this stage we have no idea whether +/// these multiple rdregions/wrregions will have the same base register, so +/// we must assume not and insert a separate ``llvm.genx.convert.addr`` +/// for each rdregion/wrregion use of the index. +/// +/// These multiple address conversions of the same index are commoned up +/// where possible later on in GenXAddressCommoning. That pass runs after +/// GenXCoalescing, so it can tell whether two address conversions of the +/// same index also have the same base register because the "old value" +/// inputs of the regions have been coalesced together. +/// +/// Where an index used in an indirect region is a constant add, this pass +/// inserts the ``llvm.genx.convert.addr`` before that, and turns the constant +/// add into ``llvm.genx.add.addr``. The latter can be baled into rdregion +/// or wrregion, representing a constant offset in the indirect region. +/// Only one ``llvm.genx.add.addr`` is allowed between the +/// ``llvm.genx.convert.addr`` and the use in a rdregion/wrregion. +/// +/// However this pass does not check whether the offset is in range (although +/// GenXBaling does check that before deciding to bale it in). The +/// GenXAddressCommoning pass sorts that out. +/// +/// **IR restriction**: After this pass, a variable index in a rdregion/wrregion +/// must be the result of ``llvm.genx.convert.addr`` or ``llvm.genx.add.addr``. +/// Operand 0 of ``llvm.genx.add.addr`` must be the result of +/// ``llvm.genx.convert.addr``. +/// +/// **IR restriction**: After this pass, up to GenXAddressCommoning, the result +/// of ``llvm.genx.convert.addr`` must have a single use in either a +/// ``llvm.genx.add.addr`` or as the index in rdregion/wrregion. The result +/// of ``llvm.genx.add.addr`` must have a single use as the index in +/// rdregion/wrregion. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CATEGORY" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Metadata.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +namespace { + + // CategoryAndAlignment : values returned from getCategoryAndAlignment* + // functions + struct CategoryAndAlignment { + unsigned Cat; + unsigned Align; + CategoryAndAlignment(unsigned Cat, unsigned Align = 0) : Cat(Cat), Align(Align) {} + }; + + class UsesCatInfo; + + // GenX category pass + class GenXCategory : public FunctionGroupPass { + Function *Func; + KernelMetadata KM; + GenXLiveness *Liveness; + DominatorTreeGroupWrapperPass *DTs; + SmallVector ToErase; + bool Modified; + // Vector of arguments and phi nodes that did not get a category. 
+ SmallVector NoCategory; + bool InFGHead; + public: + static char ID; + explicit GenXCategory() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX category conversion"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + unsigned getCategoryForPhiIncomings(PHINode *Phi) const; + unsigned getCategoryForCallArg(Function *Callee, unsigned ArgNo) const; + unsigned getCategoryForInlasmConstraintedOp(CallInst *CI, unsigned ArgNo, + bool IsOutput) const; + CategoryAndAlignment getCategoryAndAlignmentForDef(Value *V) const; + CategoryAndAlignment getCategoryAndAlignmentForUse(Value::use_iterator U) const; + private: + const GenXSubtarget *Subtarget; + using ConvListT = std::array; + + bool processFunction(Function *F); + bool fixCircularPhis(Function *F); + bool processValue(Value *V); + Instruction *createConversion(Value *V, unsigned Cat); + ConvListT buildConversions(Value *Def, CategoryAndAlignment DefInfo, const UsesCatInfo &UsesInfo); + }; + + // AUse : an address use of a value in processValue() + struct AUse { + Instruction *user; + unsigned OperandNum; + unsigned Cat; + AUse(Value::use_iterator U, unsigned Cat) + : user(cast(U->getUser())), + OperandNum(U->getOperandNo()), Cat(Cat) {} + }; + + // almost real input iterator, minimum for range for was implemented + class Iterator final { + unsigned ShiftedMask_; + unsigned CurCat_; + + public: + Iterator(unsigned Mask, unsigned Cat) : ShiftedMask_(Mask), CurCat_(Cat) { + validate(); + } + + unsigned operator*() const { + validate(); + return CurCat_; + } + + Iterator &operator++() { + validate(); + ShiftedMask_ /= 2; + ++CurCat_; + if (ShiftedMask_ == 0) { + CurCat_ = RegCategory::NUMCATEGORIES; + validate(); + return *this; + } + for (; ShiftedMask_ % 2 == 0; ShiftedMask_ /= 2, ++CurCat_) + ; + validate(); + return *this; + } + + friend bool operator==(const Iterator &lhs, const Iterator &rhs) { + return (lhs.ShiftedMask_ == rhs.ShiftedMask_ && + lhs.CurCat_ == rhs.CurCat_); + } + + friend bool operator!=(const Iterator &lhs, const Iterator &rhs) { + return !(lhs == rhs); + } + + private: + void validate() const { + assert((ShiftedMask_ % 2 == 1 || CurCat_ == RegCategory::NUMCATEGORIES) && + "invalid state"); + } + }; + + // Implements only begin() and end() + // to iterate over categories of uses. + class Categories final { + unsigned Mask_; + + public: + explicit Categories(unsigned Mask) : Mask_(Mask) {} + + Iterator begin() const { + // we have no category + if (!Mask_) + return end(); + // we have NONE category + if (Mask_ % 2 == 1) + return Iterator(Mask_, 0); + // we adding NONE category + Iterator FalseBegin(Mask_ + 1, 0); + // and now we get the real first category + return ++FalseBegin; + } + + Iterator end() const { return Iterator(0, RegCategory::NUMCATEGORIES); } + }; + + // Encapsulates Category'n'Alignment analysis of value uses. 
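The Iterator/Categories pair above is just a set-bit walk over a category mask; a minimal usage sketch (illustrative only, assuming the definitions above are in scope):

```cpp
// Print every register category that appears in a use mask. Bit N of Mask set
// means "some use wants RegCategory N"; Categories visits exactly those N.
static void dumpUseCategories(unsigned Mask, llvm::raw_ostream &OS) {
  for (unsigned Cat : Categories(Mask))
    OS << "use requires category " << Cat << "\n";
}
```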
+ class UsesCatInfo final { + using UsesT = llvm::SmallVector; + UsesT Uses_; + unsigned Mask_; + unsigned MaxAlign_; + unsigned MostUsedCat_; + + public: + UsesCatInfo() : Uses_(), Mask_(0), MaxAlign_(0) {} + + UsesCatInfo(const GenXCategory &PassInfo, Value *V) : UsesCatInfo() { + std::array Stat = {0}; + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto CatAlign = PassInfo.getCategoryAndAlignmentForUse(ui); + MaxAlign_ = std::max(MaxAlign_, CatAlign.Align); + Uses_.push_back(AUse(ui, CatAlign.Cat)); + Mask_ |= 1 << CatAlign.Cat; + if (CatAlign.Cat != RegCategory::NONE) + ++Stat[CatAlign.Cat]; + } + auto MaxInStatIt = std::max_element(Stat.begin(), Stat.end()); + MostUsedCat_ = MaxInStatIt - Stat.begin(); + } + + bool empty() const { return !Mask_; } + + bool allHaveCat(unsigned cat) const { return !(Mask_ & ~(1 << cat)); } + + const UsesT &getUses() const { return Uses_; } + + unsigned getMaxAlign() const { return MaxAlign_; } + + // When there's no real category uses (real is anything but NONE) + // behavior is undefined. + unsigned getMostUsedCat() const { + assert(!empty() && !allHaveCat(RegCategory::NONE) && + "works only for cases when there are uses with real categories"); + return MostUsedCat_; + } + + // meant to be used in range for + Categories getCategories() const { return Categories(Mask_); } + }; + + void placeConvAfterDef(Function *Func, Instruction *Conv, Value *Def) { + if (Instruction *Inst = dyn_cast(Def)) { + // Original value is an instruction. Insert just after it. + Conv->insertAfter(Inst); + Conv->setDebugLoc(Inst->getDebugLoc()); + } else { + assert(isa(Def) && "must be an argument if not an instruction"); + // Original value is a function argument. Insert at the start of the + // function. + Conv->insertBefore(&*Func->begin()->begin()); + } + } + + void placeConvBeforeUse(Instruction *Conv, Instruction *Use, + unsigned UseOperand) { + if (auto PhiUse = dyn_cast(Use)) { + // Use is in a phi node. Insert before terminator in corresponding + // incoming block. + Conv->insertBefore(PhiUse->getIncomingBlock(UseOperand)->getTerminator()); + } else { + // Insert just before use. + Conv->insertBefore(Use); + Conv->setDebugLoc(Use->getDebugLoc()); + } + } + + } // end anonymous namespace + +char GenXCategory::ID = 0; +namespace llvm { void initializeGenXCategoryPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXCategory, "GenXCategory", "GenXCategory", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXCategory, "GenXCategory", "GenXCategory", false, false) + +FunctionGroupPass *llvm::createGenXCategoryPass() +{ + initializeGenXCategoryPass(*PassRegistry::getPassRegistry()); + return new GenXCategory(); +} + +void GenXCategory::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the category conversion pass for + * this FunctionGroup + */ +bool GenXCategory::runOnFunctionGroup(FunctionGroup &FG) +{ + KM = KernelMetadata(FG.getHead()); + DTs = &getAnalysis(); + Liveness = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + Subtarget = P ? P->getSubtarget() : nullptr; + bool Modified = false; + if (KM.isKernel()) { + // Get the offset of each kernel arg. 
+ for (auto ai = FG.getHead()->arg_begin(), ae = FG.getHead()->arg_end(); + ai != ae; ++ai) { + Argument *Arg = &*ai; + Liveness->getOrCreateLiveRange(Arg)->Offset = KM.getArgOffset(Arg->getArgNo()); + } + } + // Mop up any struct phis, splitting into elements. + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) + Modified |= splitStructPhis(*i); + // Do category conversion on each function in the group. + InFGHead = true; + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) { + Modified |= processFunction(*i); + InFGHead = false; + } + // Now iteratively process values that did not get a category. A valid + // category will eventually propagate through a web of phi nodes + // and/or subroutine args. + while (NoCategory.size()) { + SmallVector NoCategory2; + for (unsigned i = 0, e = NoCategory.size(); i != e; ++i) { + if (!processValue(NoCategory[i])) + NoCategory2.push_back(NoCategory[i]); + } + assert(NoCategory2.size() < NoCategory.size() && "not making any progess"); + NoCategory.clear(); + if (!NoCategory2.size()) + break; + for (unsigned i = 0, e = NoCategory2.size(); i != e; ++i) { + if (!processValue(NoCategory2[i])) + NoCategory.push_back(NoCategory2[i]); + } + Modified |= true; + } + return Modified; +} + +// Common up constpred calls within a block. +static bool commonUpPredicate(BasicBlock *BB) { + bool Changed = false; + // Map from flatten predicate value to its constpred calls. + SmallDenseMap> ValMap; + + for (auto &Inst : BB->getInstList()) { + if (GenXIntrinsic::getGenXIntrinsicID(&Inst) == GenXIntrinsic::genx_constantpred) { + Constant *V = cast(Inst.getOperand(0)); + if (auto VT = dyn_cast(V->getType())) { + unsigned NElts = VT->getVectorNumElements(); + if (NElts > 64) + continue; + uint64_t Bits = 0; + for (unsigned i = 0; i != NElts; ++i) + if (!V->getAggregateElement(i)->isNullValue()) + Bits |= ((uint64_t)1 << i); + auto Iter = ValMap.find(Bits); + if (Iter == ValMap.end()) + ValMap[Bits].push_back(&Inst); + else if (Inst.hasOneUse() && Inst.user_back()->getParent() == BB) + // Just in case constpred is not from constant predicate loading. This + // ensures the first instruction dominates others in the same vector. + (Iter->second).push_back(&Inst); + } + } + } + + // Common up when there are more than 2 uses, in which case it will not be + // worse than flag spills. + for (auto I = ValMap.begin(), E = ValMap.end(); I != E; ++I) { + auto &V = I->second; + int n = (int)V.size(); + if (n > 2) { + Instruction *DomInst = V.front(); + for (int i = 1; i < n; ++i) { + V[i]->replaceAllUsesWith(DomInst); + V[i]->eraseFromParent(); + } + Changed = true; + } + } + + return Changed; +} + +/*********************************************************************** + * processFunction : run the category conversion pass for this Function + * + * This does a postordered depth first traversal of the CFG, + * processing instructions within a basic block in reverse, to + * ensure that we see a def after its uses (ignoring phi node uses). + * This is specifically useful for an address conversion, where we want to + * see the constant add used in an indirect region (and convert it into a + * llvm.genx.add.addr) before we see the instruction it uses. + */ +bool GenXCategory::processFunction(Function *F) +{ + Func = F; + // Before doing the category conversion, fix circular phis. + Modified = fixCircularPhis(F); + // Load constants in phi nodes. + loadPhiConstants(F, DTs->getDomTree(F), false, Subtarget); + // Process all instructions. 
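commonUpPredicate above keys its map on the predicate's lane pattern flattened into a 64-bit integer, so two constantpred calls loading the same mask land in the same bucket. A standalone model of that flattening (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Flatten up to 64 lane flags into one integer key, as the pass does with the
// elements of a constant predicate vector.
static uint64_t flattenLanes(const bool *Lanes, unsigned NElts) {
  uint64_t Bits = 0;
  for (unsigned i = 0; i != NElts; ++i)
    if (Lanes[i])
      Bits |= (uint64_t)1 << i;
  return Bits;
}

int main() {
  bool A[4] = {true, false, true, true};
  bool B[4] = {true, false, true, true};
  assert(flattenLanes(A, 4) == flattenLanes(B, 4)); // identical masks common up
  assert(flattenLanes(A, 4) == 0b1101u);            // lane i -> bit i
  return 0;
}
```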
+ for (po_iterator i = po_begin(&Func->getEntryBlock()), + e = po_end(&Func->getEntryBlock()); i != e; ++i) { + // This loop scans the basic block backwards. If any code is inserted + // before the current point, that code is scanned too. + BasicBlock *BB = *i; + for (Instruction *Inst = &BB->back(); Inst; + Inst = (Inst == &BB->front() ? nullptr : Inst->getPrevNode())) { + Modified |= loadNonSimpleConstants(Inst, nullptr, Subtarget); + Modified |= loadConstants(Inst, Subtarget); + if (!processValue(Inst)) + NoCategory.push_back(Inst); + } + + // This commons up constpred calls just loaded. + Modified |= commonUpPredicate(BB); + + // Erase instructions (and their live ranges) as requested by processValue. + for (unsigned i = 0, e = ToErase.size(); i != e; ++i) { + Liveness->eraseLiveRange(ToErase[i]); + ToErase[i]->eraseFromParent(); + } + ToErase.clear(); + } + // Process all args. + for (auto fi = Func->arg_begin(), fe = Func->arg_end(); fi != fe; ++fi) { + Value *V = &*fi; + if (!processValue(V)) + NoCategory.push_back(V); + } + return Modified; +} + +/*********************************************************************** + * fixCircularPhis : fix up overlapping circular phi nodes + * + * A phi node at the head of a loop can have a use in the phi nodes in the same + * basic block. If the use is after the def, it still refers to the value in + * the previous loop iteration, but the GenX backend cannot cope with the + * live range going round the loop and overlapping with its own start. + * + * This function spots any such phi node and works around it by inserting an + * extra copy (bitcast) just after the phi nodes in the basic block. + * + * A better solution for the future would be to re-order the phi nodes if + * possible, and only fall back to inserting a copy if there is circularity + * (e.g. a loop that swaps two variables in its body). + */ +bool GenXCategory::fixCircularPhis(Function *F) +{ + bool Modified = false; + for (auto fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + // Process phi nodes in one basic block. + for (auto bi = BB->begin(); ; ++bi) { + auto Phi = dyn_cast(&*bi); + if (!Phi) + break; // end of phi nodes + if (!GenXLiveness::wrapsAround(Phi, Phi)) + continue; + // Overlapping circular phi node. Insert a copy. + // Note that the copy has to be split in the same way as a copy + // inserted in GenXCoalescing when coalescing fails, but we have + // our own code here because at this point we do not have any real + // and possibly coalesced live ranges like GenXCoalescing does. + Modified = true; + SmallVector Uses; + for (auto ui = Phi->use_begin(), ue = Phi->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + // A phi node is never a struct -- GenXLowering removed struct phis. + assert(!isa(Phi->getType())); + // Insert a copy, split as required to be legal. + auto NewCopy = Liveness->insertCopy(Phi, nullptr, BB->getFirstNonPHI(), + Phi->getName() + ".unoverlapper", 0); + // Change the uses that existed before we added the copy to use the + // copy instead. + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = NewCopy; + } + } + return Modified; +} + +/*********************************************************************** + * processValue : category conversion for one value + * + * Return: whether category successfully chosen + * + * This returns false only for a function argument or a phi node where all + * uses are in phi nodes which themselves do not have a category yet. 
+ */ +bool GenXCategory::processValue(Value *V) +{ + // Check for special cases. + // Ignore void. + if (V->getType()->isVoidTy()) + return true; + // Ignore i1 or vector of i1. Predicates do not use category + // conversion. + if (V->getType()->getScalarType()->isIntegerTy(1)) + return true; + // Elements of a struct always have default (general or predicate) category. + if (isa(V->getType())) + return true; + + auto DefInfo = getCategoryAndAlignmentForDef(V); + UsesCatInfo UsesInfo(*this, V); + + // more corner cases + if (UsesInfo.empty()) { + // Value not used: set its category and then ignore it. If the definition + // did not give us a category (probably an unused function arg), then + // arbitrarily make it general. + if (DefInfo.Cat == RegCategory::NONE) + Liveness->getOrCreateLiveRange(V, RegCategory::GENERAL, DefInfo.Align); + else + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, DefInfo.Align); + return true; + } + else if (UsesInfo.allHaveCat(RegCategory::NONE)) + { + if (DefInfo.Cat == RegCategory::NONE) { + // The "no categories at all" case can only happen for a value that is + // defined by a function argument or a phi node and used only in phi + // nodes or subroutine call args. + assert((isa(V) || isa(V)) && "no register category"); + return false; + } + // Value defined with a category but only used in phi nodes. + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, DefInfo.Align); + return true; + } + + // main case + if (DefInfo.Cat == RegCategory::NONE) { + // NONE means that we're free to choose the category + if (isa(V)) + // currently we'd like to propogate general through phi + DefInfo.Cat = RegCategory::GENERAL; + else + DefInfo.Cat = UsesInfo.getMostUsedCat(); + } + + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, std::max(DefInfo.Align, UsesInfo.getMaxAlign())); + auto Convs = buildConversions(V, DefInfo, UsesInfo); + for (auto UseInfo : UsesInfo.getUses()) { + if (UseInfo.Cat != DefInfo.Cat && UseInfo.Cat != RegCategory::NONE) { + Instruction *Conv; + if (UseInfo.Cat == RegCategory::ADDRESS) { + // Case of address category requires a separate conversion for each use, at least until we + // get to GenXAddressCommoning where we decide whether we can common some of them up. + Conv = createConversion(V, UseInfo.Cat); + placeConvBeforeUse(Conv, UseInfo.user, UseInfo.OperandNum); + Liveness->getOrCreateLiveRange(Conv)->setCategory(UseInfo.Cat); + } + else + Conv = Convs[UseInfo.Cat]; + assert(Conv && "must have such conversion"); + UseInfo.user->setOperand(UseInfo.OperandNum, Conv); + } + } + // If V is now unused (which happens if it is a constant add and all its + // uses were addresses), then remove it. + if (V->use_empty()) + ToErase.push_back(cast(V)); + return true; +} + +/*********************************************************************** + * createConversion : create call to llvm.genx.convert intrinsic to represent + * register category conversion + * + * The new instruction is not inserted anywhere yet. + * + * In the case that we are asked to convert a use of an add or constant sub + * to an address, we instead create an llvm.genx.add.addr of the input + * to the add/sub. + */ +Instruction *GenXCategory::createConversion(Value *V, unsigned Cat) +{ + assert(V->getType()->getScalarType()->isIntegerTy() && "createConversion expects int type"); + if (Cat == RegCategory::ADDRESS) { + Value *Input = V; + int Offset = 0; + for (;;) { + // Check for use of add/sub that can be baled in to a region as a + // constant offset. 
This also handles a chain of two or more adds. + int ThisOffset; + if (!GenXBaling::getIndexAdd(Input, &ThisOffset) && + !GenXBaling::getIndexOr(Input, ThisOffset)) + break; + if (ThisOffset < G4_MIN_ADDR_IMM) + break; + Offset += ThisOffset; + Input = cast(Input)->getOperand(0); + } + if (Input != V) { + // Turn the add/sub into llvm.genx.add.addr. This could be out of range as + // a constant offset in an indirect operand at this stage; + // GenXAddressCommoning sorts that out by adjusting the constant offset in + // the llvm.genx.convert.addr. + return createAddAddr(Input, ConstantInt::get(V->getType(), Offset), + V->getName() + ".addradd", nullptr, Func->getParent()); + } + } + // Normal conversion. If the source is an integer creation intrinsic + // and this isn't an address conversion, use the operand for that + // intrinsic call directly rather than using the result of the intrinsic. + // This helps the jitter to generate better code when surface constants + // are used in send intructions. + if (Cat != RegCategory::ADDRESS) { + if (GenXIntrinsic::getGenXIntrinsicID(V) == GenXIntrinsic::genx_constanti) + V = cast(V)->getArgOperand(0); + return createConvert(V, V->getName() + ".categoryconv", nullptr, + Func->getParent()); + } + return createConvertAddr(V, 0, V->getName() + ".categoryconv", nullptr, + Func->getParent()); +} + +/*********************************************************************** + * Creates conversion instructions, places them in the function (next to the + * def) + * + * Returns an array of created conversion (cons[Category] holds + * instruction if we need conversion to such Category and nullptr otherwise). + * Doesn't produce address category conversion. + */ +GenXCategory::ConvListT +GenXCategory::buildConversions(Value *Def, CategoryAndAlignment DefInfo, + const UsesCatInfo &UsesInfo) { + ConvListT Convs = {nullptr}; + for (auto Cat : UsesInfo.getCategories()) { + // NONE doesn't require conversion, ADDRESS requirs conversion before + // every use (not after def, so we won't create it here) + if (Cat != DefInfo.Cat && Cat != RegCategory::NONE && + Cat != RegCategory::ADDRESS) { + auto Conv = createConversion(Def, Cat); + placeConvAfterDef(Func, Conv, Def); + Liveness->getOrCreateLiveRange(Conv)->setCategory(Cat); + Convs[Cat] = Conv; + } + } + return Convs; +} + +/*********************************************************************** + * intrinsicCategoryToRegCategory : convert intrinsic arg category to + * register category + * + * This converts a GenXIntrinsicInfo::* category, as returned by + * GenXIntrinsicInfo::ArgInfo::getCategory(), into a register category + * as stored in a live range. + */ +static unsigned intrinsicCategoryToRegCategory(unsigned ICat) +{ + switch (ICat) { + case GenXIntrinsicInfo::ADDRESS: + return RegCategory::ADDRESS; + case GenXIntrinsicInfo::PREDICATION: + case GenXIntrinsicInfo::PREDICATE: + return RegCategory::PREDICATE; + case GenXIntrinsicInfo::SAMPLER: + return RegCategory::SAMPLER; + case GenXIntrinsicInfo::SURFACE: + return RegCategory::SURFACE; + case GenXIntrinsicInfo::VME: + return RegCategory::VME; + default: + return RegCategory::GENERAL; + } +} + +/*********************************************************************** + * getCategoryAndAlignmentForDef : get register category and alignment for a def + * + * This returns RegCategory:: value, or RegCategory::NONE if no category + * is discernable. 
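The loop at the top of createConversion above folds a whole chain of constant adds on an index into a single offset, leaving one llvm.genx.add.addr over the remaining input. A standalone model of that accumulation (illustrative only; the real code uses GenXBaling::getIndexAdd/getIndexOr on the IR):

```cpp
#include <cassert>

// Model an index expression as "Base + Add", with Base == nullptr for a leaf.
struct IndexExpr { const IndexExpr *Base; int Add; };

// Peel constant adds off the index and accumulate them into one offset.
static const IndexExpr *peelConstantAdds(const IndexExpr *E, int &Offset) {
  while (E->Base) {
    Offset += E->Add;
    E = E->Base;
  }
  return E;
}

int main() {
  IndexExpr X{nullptr, 0}, A{&X, 8}, B{&A, 4}; // models (x + 8) + 4
  int Offset = 0;
  const IndexExpr *Leaf = peelConstantAdds(&B, Offset);
  assert(Leaf == &X && Offset == 12); // one add.addr of x with offset 12
  return 0;
}
```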
+ */ +CategoryAndAlignment GenXCategory::getCategoryAndAlignmentForDef(Value *V) const +{ + if (V->getType()->getScalarType()->getPrimitiveSizeInBits() == 1) + return RegCategory::PREDICATE; + if (Argument *Arg = dyn_cast(V)) { + // This is a function Argument. + if (!InFGHead) { + // It is an arg in a subroutine. Get the category from the corresponding + // arg at some call site. (We should not have disagreement among the + // call sites and the function arg, since whichever one gets a category + // first forces the category of all the others.) + return getCategoryForCallArg(Arg->getParent(), Arg->getArgNo()); + } + unsigned ArgNo = Arg->getArgNo(); + if (KM.getNumArgs() > ArgNo) { + // The function is a kernel, and has argument kind metadata for + // this argument. Determine the category from the kind. + return KM.getArgCategory(ArgNo); + } + // The function is not a kernel, or does not have the appropriate + // metadata. Set to no particular category, so the arg's uses will + // determine the category. This is the fallback for compatibility with + // hand coded LLVM IR from before this metadata was added. (If we only + // had to cope with non-kernel functions, we could just return GENERAL.) + return RegCategory::NONE; + } + // The def is a phi-instruction. + if (PHINode *Phi = dyn_cast(V)) { + // This is a phi node. Get the category from one of the incomings. (We + // should not have disagreement among the incomings, since whichever + // one gets a category first forces the category of all the others.) + return getCategoryForPhiIncomings(Phi); + } + // Multiple outputs of inline assembly instruction + // result in a structure and those elements are extracted + // with extractelement + if (ExtractValueInst *Extract = dyn_cast(V)) { + auto CI = dyn_cast(Extract->getAggregateOperand()); + if (CI && CI->isInlineAsm()) + return getCategoryForInlasmConstraintedOp(CI, Extract->getIndices()[0], + true /*IsOutput*/); + } + // The def is a call-inst + if (CallInst *CI = dyn_cast(V)) { + if (Function *Callee = CI->getCalledFunction()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Callee); + // We should not see genx_convert, as it is inserted into a value after + // using this function to determine its category. + assert(IntrinsicID != GenXIntrinsic::genx_convert); + if (IntrinsicID == GenXIntrinsic::genx_convert_addr) + return RegCategory::ADDRESS; + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IntrinsicID) && !GenXIntrinsic::isRdRegion(IntrinsicID) + && !GenXIntrinsic::isWrRegion(IntrinsicID) && !GenXIntrinsic::isAbs(IntrinsicID)) { + // For any normal intrinsic, look up the argument class. + GenXIntrinsicInfo II(IntrinsicID); + auto AI = II.getRetInfo(); + return CategoryAndAlignment( + intrinsicCategoryToRegCategory(AI.getCategory()), + AI.getLogAlignment()); + } else if (GenXIntrinsic::isRdRegion(IntrinsicID)) { + // Add this to avoid conversion in case of read-region on SurfaceIndex + // or SamplerIndex type + auto RC = getCategoryAndAlignmentForDef( + CI->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (RC.Cat == RegCategory::SURFACE || + RC.Cat == RegCategory::SAMPLER) + return RC.Cat; + } + } else if (CI->isInlineAsm()) { + return getCategoryForInlasmConstraintedOp(CI, 0, true /*IsOutput*/); + } + } + return RegCategory::GENERAL; +} + +/*********************************************************************** + * getCategoryForInlasmConstraintedOp : get register category for a + * operand of inline assembly (both for + * output and for input). 
Category of + * operand depends on its constraint. + * + */ +unsigned GenXCategory::getCategoryForInlasmConstraintedOp(CallInst *CI, + unsigned ArgNo, + bool IsOutput) const { + assert(CI->isInlineAsm() && "Inline asm expected"); + InlineAsm *IA = dyn_cast(CI->getCalledValue()); + assert(!IA->getConstraintString().empty() && "Here should be constraints"); + + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + + if (!IsOutput) + ArgNo += genx::getInlineAsmNumOutputs(CI); + auto Info = ConstraintsInfo[ArgNo]; + + switch (Info.getConstraintType()) { + default: + llvm_unreachable("unreachable while setting category in constraints"); + case ConstraintType::Constraint_a: + case ConstraintType::Constraint_rw: + case ConstraintType::Constraint_r: + return RegCategory::GENERAL; + case ConstraintType::Constraint_n: + case ConstraintType::Constraint_i: + case ConstraintType::Constraint_F: + return RegCategory::NONE; + case ConstraintType::Constraint_cr: + return RegCategory::PREDICATE; + } +} + +/*********************************************************************** + * getCategoryAndAlignmentForUse : get register category for a use + * + * This returns RegCategory:: value, or RegCategory::NONE if no category + * is discernable. + */ +CategoryAndAlignment GenXCategory::getCategoryAndAlignmentForUse( + Value::use_iterator U) const +{ + Value *V = U->get(); + if (V->getType()->getScalarType()->isIntegerTy(1)) + return RegCategory::PREDICATE; + auto user = cast(U->getUser()); + if (PHINode *Phi = dyn_cast(user)) { + // This is a phi node. Get the category (if any) from the result, or from + // one of the incomings. (We should not have disagreement among the + // incomings, since whichever one gets a category first forces the category + // of all the others.) + if (auto LR = Liveness->getLiveRangeOrNull(Phi)) { + auto Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + return getCategoryForPhiIncomings(Phi); + } + unsigned Category = RegCategory::GENERAL; + if (IGCLLVM::CallInst *CI = dyn_cast(user)) { + if (CI->isInlineAsm()) + Category = getCategoryForInlasmConstraintedOp(CI, U->getOperandNo(), + false /*IsOutput*/); + else if (CI->isIndirectCall()) + Category = RegCategory::GENERAL; + else { + Function *Callee = CI->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::not_any_intrinsic; + if (Callee) + IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + // We should not see genx_convert, as it is inserted into a value after + // using this function to determine its category. + assert(IntrinID != GenXIntrinsic::genx_convert); + // For a read or write region or element intrisic, where the use we have + // is the address, mark as needing an address register. + switch (IntrinID) { + case GenXIntrinsic::not_any_intrinsic: + // Arg in subroutine call. Get the category from the function arg, + // or the arg at another call site. (We should not have disagreement + // among the call sites and the function arg, since whichever one + // gets a category first forces the category of all the others.) 
+ Category = getCategoryForCallArg(Callee, U->getOperandNo()); + break; + case GenXIntrinsic::genx_convert_addr: + Category = RegCategory::GENERAL; + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (U->getOperandNo() == 4) // is addr-operand + Category = RegCategory::ADDRESS; + else if (GenXIntrinsic::GenXRegion::OldValueOperandNum == U->getOperandNo()) + Category = RegCategory::NONE; // do not assign use-category + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (U->getOperandNo() == 5) // is addr-operand + Category = RegCategory::ADDRESS; + break; + case GenXIntrinsic::genx_absf: + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_output: + break; + default: { + // For any other intrinsic, look up the argument class. + GenXIntrinsicInfo II(IntrinID); + auto AI = II.getArgInfo(U->getOperandNo()); + return CategoryAndAlignment( + intrinsicCategoryToRegCategory(AI.getCategory()), + AI.getLogAlignment()); + } + break; + } + } + } + return Category; +} + +/*********************************************************************** + * getCategoryForPhiIncomings : get register category from phi incomings + * + * Return: register category from a non-const incoming with a known category + * else NONE if at least one incoming is non-constant + * else GENERAL + * + * We will not have disagreement among the incomings, since whichever one gets + * a category first forces the category of all the others. + */ +unsigned GenXCategory::getCategoryForPhiIncomings(PHINode *Phi) const +{ + bool AllConst = true; + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + if (!isa(Incoming)) { + AllConst = false; + if (auto LR = Liveness->getLiveRangeOrNull(Incoming)) { + unsigned Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + } + } + if (AllConst) { + // All incomings are constant. Arbitrarily make the phi node value + // general category. + return RegCategory::GENERAL; + } + // No incoming has a category yet. + return RegCategory::NONE; +} + +/*********************************************************************** + * getCategoryForCallArg : get register category from subroutine arg or + * the corresponding arg at some call site + * + * Enter: Callee = function being called + * ArgNo = argument number + * + * Return: register category from subroutine arg or a call arg with a + * known category, else NONE if no category found + * + * We will not have disagreement among the subroutine arg and its corresponding + * call args, since whichever one gets a category first forces the category of + * all the others. + */ +unsigned GenXCategory::getCategoryForCallArg(Function *Callee, unsigned ArgNo) const +{ + assert(Callee); + // First try the subroutine arg. + auto ai = Callee->arg_begin(); + for (unsigned i = 0; i != ArgNo; ++i, ++ai) + ; + if (auto LR = Liveness->getLiveRangeOrNull(&*ai)) { + unsigned Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + // Then try the arg at each call site. 
+ bool UseUndef = true;
+ for (auto ui = Callee->use_begin(), ue = Callee->use_end(); ui != ue; ++ui) {
+ if (auto CI = dyn_cast<CallInst>(ui->getUser())) {
+ auto ArgV = CI->getArgOperand(ArgNo);
+ if (!isa<UndefValue>(ArgV)) {
+ UseUndef = false;
+ if (auto LR = Liveness->getLiveRangeOrNull(ArgV)) {
+ unsigned Cat = LR->getCategory();
+ if (Cat != RegCategory::NONE)
+ return Cat;
+ }
+ }
+ }
+ }
+ // special case handling to break deadlock when all uses are undef,
+ // force the argument to be GENERAL
+ return(UseUndef ? RegCategory::GENERAL : RegCategory::NONE);
+}
+
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp
new file mode 100644
index 000000000000..dc48d296615a
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp
@@ -0,0 +1,5779 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXCisaBuilder
+/// ---------------
+///
+/// This file contains two passes: GenXCisaBuilder and GenXFinalizer.
+///
+/// 1. GenXCisaBuilder transforms LLVM IR to CISA IR via the Finalizer's public API.
+/// It is a FunctionGroupPass, so it runs once for each kernel and creates
+/// CISA IR for it and all its subroutines.
+/// Real building of kernels is performed by the GenXKernelBuilder class.
+/// This split is necessary because the GenXCisaBuilder object lives across
+/// all function groups, but the kernel-specific building data does not need
+/// to be kept alive that long.
+///
+/// 2. GenXFinalizer is a module pass, so it runs once; all it does is run the
+/// Finalizer on the kernels created by the GenXCisaBuilder pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "GenXGotoJoin.h"
+#include "GenXIntrinsics.h"
+#include "GenXOCLRuntimeInfo.h"
+#include "GenXPressureTracker.h"
+#include "GenXRegion.h"
+#include "GenXUtil.h"
+#include "GenXVisaRegAlloc.h"
+#include "common.h"
+#include "vc/GenXOpts/Utils/KernelInfo.h"
+#include "visaBuilder_interface.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/GenXIntrinsics/GenXIntrinsicInst.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+#include "llvmWrapper/IR/InstrTypes.h"
+
+#include
+#include
+#include
+
+// If 1, print VISA instructions after the corresponding LLVM instruction.
+// Only for debug purposes, uses Finalizer internal API.
+#define DUMP_VISA_INTSTRUCTIONS 0
+
+#if DUMP_VISA_INTSTRUCTIONS
+#include "Common_ISA_framework.h"
+#include "IsaDisassembly.h"
+#include "Mem_Manager.h"
+#include "VISAKernel.h"
+#endif
+
+#ifndef COMMON_ISA_MAX_FILENAME_LENGTH
+#define COMMON_ISA_MAX_FILENAME_LENGTH 1023
+#endif
+
+using namespace llvm;
+using namespace genx;
+
+#define DEBUG_TYPE "GENX_CISA_BUILDER"
+
+static cl::opt<bool> EmitVisa("emit-visa", cl::init(false), cl::Hidden,
+                              cl::desc("Generate Visa instead of fat binary."));
+static cl::list<std::string>
+    FinalizerOpts("finalizer-opts", cl::Hidden, cl::ZeroOrMore,
+                  cl::desc("Additional options for finalizer."));
+
+static cl::opt<std::string> AsmNameOpt(
+    "asm-name", cl::init(""), cl::Hidden,
+    cl::desc("Output assembly code to this file during compilation."));
+
+static cl::opt<bool> ReverseKernels(
+    "reverse-kernels", cl::init(false), cl::Hidden,
+    cl::desc("Emit the kernel asm name in reversed order (if a user asm name is present)."));
+
+static cl::opt<bool>
+    PrintFinalizerOptions("cg-print-finalizer-args", cl::init(false), cl::Hidden,
+                          cl::desc("Prints options used to invoke finalizer"));
+
+enum {
+  BYTES_PER_OWORD = 16,
+  BYTES_PER_FADDR = 8,
+  // stackcall ABI related constants
+  ARG_SIZE_IN_GRFS = 32,
+  RET_SIZE_IN_GRFS = 12,
+  STACK_PER_THREAD = 256
+};
+
+/// For VISA_PREDICATE_CONTROL & VISA_PREDICATE_STATE
+template <typename T> T &operator^=(T &a, T b) {
+  using _T = typename std::underlying_type<T>::type;
+  static_assert(std::is_integral<_T>::value,
+                "Wrong operation for non-integral type");
+  a = static_cast<T>(static_cast<_T>(a) ^ static_cast<_T>(b));
+  return a;
+}
+
+template <typename T> T operator|=(T &a, T b) {
+  using _T = typename std::underlying_type<T>::type;
+  static_assert(std::is_integral<_T>::value,
+                "Wrong operation for non-integral type");
+  a = static_cast<T>(static_cast<_T>(a) | static_cast<_T>(b));
+  return a;
+}
+
+struct DstOpndDesc {
+  Instruction *WrRegion = nullptr;
+  Instruction *GStore = nullptr;
+  genx::BaleInfo WrRegionBI;
+};
+
+namespace {
+
+// Diagnostic information for errors/warnings in the GEN IR building passes.
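+// A diagnostic of this kind is raised through the LLVMContext, e.g.
+// (illustrative use, mirroring handleCisaCallError further below):
+//
+//   DiagnosticInfoCisaBuild Err("some failure description", DS_Error);
+//   Ctx.diagnose(Err);
+//
+// The context's installed diagnostic handler then decides how to report it.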
+class DiagnosticInfoCisaBuild : public DiagnosticInfo { +private: + std::string Description; + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoCisaBuild(const Twine &Desc, DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity) { + Description = (Twine("GENX IR generation error: ") + Desc).str(); + } + + void print(DiagnosticPrinter &DP) const override { DP << Description; } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoCisaBuild::KindID = 0; + + +static VISA_Exec_Size getExecSizeFromValue(unsigned int Size) { + int Res = genx::log2(Size); + assert(std::bitset(Size).count() <= 1); + assert(Res <= 5 && + "illegal common ISA execsize (should be 1, 2, 4, 8, 16, 32)."); + return Res == -1 ? EXEC_SIZE_ILLEGAL : (VISA_Exec_Size)Res; +} + +static VISA_Oword_Num getCisaOwordNumFromNumber(unsigned num) { + switch (num) { + case 1: + return OWORD_NUM_1; + case 2: + return OWORD_NUM_2; + case 4: + return OWORD_NUM_4; + case 8: + return OWORD_NUM_8; + case 16: + return OWORD_NUM_16; + default: + MUST_BE_TRUE(false, "illegal Oword number."); + return OWORD_NUM_ILLEGAL; + } +} + +VISAChannelMask convertChannelMaskToVisaType(unsigned Mask) { + switch (Mask & 0xf) { + case 1: + return CHANNEL_MASK_R; + case 2: + return CHANNEL_MASK_G; + case 3: + return CHANNEL_MASK_RG; + case 4: + return CHANNEL_MASK_B; + case 5: + return CHANNEL_MASK_RB; + case 6: + return CHANNEL_MASK_GB; + case 7: + return CHANNEL_MASK_RGB; + case 8: + return CHANNEL_MASK_A; + case 9: + return CHANNEL_MASK_RA; + case 10: + return CHANNEL_MASK_GA; + case 11: + return CHANNEL_MASK_RGA; + case 12: + return CHANNEL_MASK_BA; + case 13: + return CHANNEL_MASK_RBA; + case 14: + return CHANNEL_MASK_GBA; + case 15: + return CHANNEL_MASK_RGBA; + default: + llvm_unreachable("Wrong mask"); + } +} + +CHANNEL_OUTPUT_FORMAT getChannelOutputFormat(uint8_t ChannelOutput) { + return (CHANNEL_OUTPUT_FORMAT)((ChannelOutput >> 4) & 0x3); +} + +std::string cutString(std::string Str) { + // vISA is limited to 64 byte strings. But old fe-compiler seems to ignore + // that for source filenames. 
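+  // E.g. an 80-character kernel name is cut down to its first 64 characters
+  // here (illustrative; the erase below drops everything from index 64 on).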
+ if (Str.size() > 64) + Str.erase(64); + return Str; +} + +void handleCisaCallError(int CallResult, const Twine &Call, LLVMContext &Ctx) { + StringRef ErrorType; + switch (CallResult) { + case VISA_SPILL: + ErrorType = "register allocation for a kernel failed, even with spill code"; + break; + case VISA_FAILURE: + ErrorType = "general failure"; + break; + default: + ErrorType = "unknown error"; + break; + } +#ifndef NDEBUG + DiagnosticInfoCisaBuild Err( + "VISA builder API call failed (" + Call + "): " + ErrorType, DS_Error); +#else + DiagnosticInfoCisaBuild Err("VISA builder API call failed: " + ErrorType, + DS_Error); +#endif + Ctx.diagnose(Err); +} + +} // namespace + +#define CISA_CALL(c) \ + do { \ + auto result = c; \ + if (result != VISA_SUCCESS) { \ + handleCisaCallError(result, #c, getContext()); \ + } \ + } while (0); + +namespace llvm { + +static VISA_Type getVisaTypeFromBytesNumber(unsigned BytesNum, bool IsFloat, + genx::Signedness Sign) { + VISA_Type aliasType; + if (IsFloat) { + switch (BytesNum) { + case 2: + aliasType = ISA_TYPE_HF; + break; + case 4: + aliasType = ISA_TYPE_F; + break; + case 8: + aliasType = ISA_TYPE_DF; + break; + default: + report_fatal_error("unknown float type"); + break; + } + } else { + switch (BytesNum) { + case 1: + aliasType = (Sign == SIGNED) ? ISA_TYPE_B : ISA_TYPE_UB; + break; + case 2: + aliasType = (Sign == SIGNED) ? ISA_TYPE_W : ISA_TYPE_UW; + break; + case 4: + aliasType = (Sign == SIGNED) ? ISA_TYPE_D : ISA_TYPE_UD; + break; + case 8: + aliasType = (Sign == SIGNED) ? ISA_TYPE_Q : ISA_TYPE_UQ; + break; + default: + report_fatal_error("unknown integer type"); + break; + } + } + return aliasType; +} + +static VISA_Type llvmToVisaType(Type *Type, + genx::Signedness Sign = DONTCARESIGNED) { + auto T = Type; + assert(!T->isAggregateType()); + VISA_Type Result = ISA_TYPE_NUM; + if (T->isVectorTy() && T->getVectorElementType()->isIntegerTy(1)) { + switch (Type->getVectorNumElements()) { + case 8: + Result = (Sign == SIGNED) ? ISA_TYPE_B : ISA_TYPE_UB; + break; + case 16: + Result = (Sign == SIGNED) ? ISA_TYPE_W : ISA_TYPE_UW; + break; + case 32: + Result = (Sign == SIGNED) ? ISA_TYPE_D : ISA_TYPE_UD; + break; + default: + report_fatal_error("only 8xi1 and 32xi1 are currently supported"); + break; + } + } else { + if (T->isVectorTy()) + T = T->getVectorElementType(); + if (T->isPointerTy() && T->getPointerElementType()->isFunctionTy()) { + // we might have used DL to get the type size but that'd + // overcomplicate this function's type unnecessarily + Result = getVisaTypeFromBytesNumber(BYTES_PER_FADDR, false, DONTCARESIGNED); + } else { + assert(T->isFloatingPointTy() || T->isIntegerTy()); + Result = getVisaTypeFromBytesNumber(T->getScalarSizeInBits() / CHAR_BIT, + T->isFloatingPointTy(), Sign); + } + } + assert(Result != ISA_TYPE_NUM); + return Result; +} + +static VISA_Type llvmToVisaType(Value *V, + genx::Signedness Sign = DONTCARESIGNED) { + return llvmToVisaType(V->getType(), Sign); +} + +// Due to the lack of access to VISA_GenVar internal interfaces (concerning type, size, etc) +// some local DS are required to store such info: CisaVariable and GenericCisaVariable. + +//===----------------------------------------------------------------------===// +// CisaVariable +// ------------------ +// +// CisaVariable keeps VISA_GenVar of a specific VISA_Type and provides accessors +// to its byte size and number of elements thus emulating some internal vISA machinery. 
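+// For example (illustrative numbers): a 64-byte variable aliased as ISA_TYPE_F
+// (4-byte elements) reports getNumElements() == 16, while the same storage
+// aliased as ISA_TYPE_W (2-byte elements) would report 32.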
+// +//===----------------------------------------------------------------------===// +class CisaVariable { + VISA_Type Type; + unsigned ByteSize = 0; + VISA_GenVar *VisaVar = nullptr; + +public: + CisaVariable(VISA_Type T, unsigned BS, VISA_GenVar *V) + : Type(T), ByteSize(BS), VisaVar(V) {} + + VISA_Type getType() const { return Type; } + + VISA_GenVar *getGenVar() { return VisaVar; } + + unsigned getByteSize() const { return ByteSize; } + + unsigned getNumElements() const { + assert(!(ByteSize % CISATypeTable[Type].typeSize)); + return ByteSize / CISATypeTable[Type].typeSize; + } +}; + +//===----------------------------------------------------------------------===// +// GenericCisaVariable +// ------------------ +// +// GenericCisaVariable describes vISA value that isn't intended to have matching llvm::Value +// (e.g. stack regs %arg and %retv). It provides interface to get a VisaVar alias with a specific +// vISA type. +// +//===----------------------------------------------------------------------===// +class GenericCisaVariable { + const char *Name = ""; + VISA_GenVar *VisaVar = nullptr; + unsigned ByteSize = 0; + + IndexedMap AliasDecls; + std::list Storage; + + unsigned getNumElements(VISA_Type T) const { + assert(!(ByteSize % CISATypeTable[T].typeSize)); + return ByteSize / CISATypeTable[T].typeSize; + } + +public: + GenericCisaVariable(const char *Nm, VISA_GenVar *V, unsigned BS) + : Name(Nm), VisaVar(V), ByteSize(BS) { + AliasDecls.grow(ISA_TYPE_NUM); + } + + CisaVariable *getAlias(Value *V, VISAKernel *K) { + return getAlias(llvmToVisaType(V), K); + } + + CisaVariable *getAlias(VISA_Type T, VISAKernel *K) { + if (!AliasDecls[T]) { + VISA_GenVar *VV = nullptr; + K->CreateVISAGenVar(VV, Name, getNumElements(T), T, ALIGN_GRF, VisaVar); + Storage.push_back(CisaVariable(T, ByteSize, VV)); + AliasDecls[T] = &Storage.back(); + } + return AliasDecls[T]; + } + + unsigned getByteSize() const { return ByteSize; } +}; + +//===----------------------------------------------------------------------===// +/// GenXCisaBuilder +/// ------------------ +/// +/// This class encapsulates a creation of vISA kernels. +/// It is a FunctionGroupPass, thus it runs once for each kernel and +/// builds vISA kernel via class GenXKernelBuilder. +/// All created kernels are stored in CISA Builder object which is provided +/// by finalizer. +/// +//===----------------------------------------------------------------------===// +class GenXCisaBuilder : public FunctionGroupPass { + LLVMContext *Ctx = nullptr; + +public: + static char ID; + explicit GenXCisaBuilder() : FunctionGroupPass(ID) {} + + virtual StringRef getPassName() const { + return "GenX CISA construction pass"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + + LLVMContext &getContext() { + assert(Ctx); + return *Ctx; + } +}; + +void initializeGenXCisaBuilderPass(PassRegistry &); + +//===----------------------------------------------------------------------===// +/// GenXKernelBuilder +/// ------------------ +/// +/// This class does all the work for creation of vISA kernels. 
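+///
+/// As an overview sketch (this mirrors what GenXCisaBuilder::runOnFunctionGroup
+/// below actually does; the member names are the ones declared in this class):
+///
+///   GenXKernelBuilder KB(&FG);   // constructor collects kernel info
+///   KB.RegAlloc = ...;           // the pass injects its analyses
+///   KB.Liveness = ...;
+///   std::string KernelName;
+///   KB.run(KernelName);          // declare the vISA kernel, then build
+///                                // variables, inputs and instructions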
+/// +//===----------------------------------------------------------------------===// +class GenXKernelBuilder { + using Register = GenXVisaRegAlloc::Reg; + + VISAKernel *MainKernel = nullptr; + VISAFunction *Kernel = nullptr; + genx::KernelMetadata TheKernelMetadata; + LLVMContext &Ctx; + const DataLayout &DL; + + std::map Func2Kern; + + std::map StringPool; + std::vector Labels; + std::map LabelMap; + + // loop info for each function + std::map *> Loops; + ValueMap IsInLoopCache; + + bool HasBarrier = false; + bool HasCallable = false; + bool HasStackcalls = false; + bool HasAlloca = false; + // GRF width in unit of byte + unsigned GrfByteSize = 32; + + int LastLabel = 0; + unsigned LastLine = 0; + unsigned PendingLine = 0; + StringRef LastFilename; + StringRef PendingFilename; + StringRef LastDirectory; + StringRef PendingDirectory; + + // function currently being written during constructor + Function *Func = nullptr; + // function corresponding to VISAKernel currently being written + Function *KernFunc = nullptr; + PreDefined_Surface StackSurf; + + std::map FPMap; + SmallVector RetvInserts; + + std::map> CisaVars; + + // The default float control from kernel attribute. Each subroutine may + // overrride this control mask, but it should revert back to the default float + // control mask before exiting from the subroutine. + uint32_t DefaultFloatControl = 0; + + static const uint32_t CR_Mask = 0x1 << 10 | 0x3 << 6 | 0x3 << 4 | 0x1; + + // normally false, set to true if there is any SIMD CF in the func or this is + // (indirectly) called inside any SIMD CF. + bool NoMask = false; + + genx::AlignmentInfo AI; + +public: + FunctionGroup *FG = nullptr; + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + GenXVisaRegAlloc *RegAlloc = nullptr; + FunctionGroupAnalysis *FGA = nullptr; + GenXModule *GM = nullptr; + DominatorTreeGroupWrapperPass *DTs = nullptr; + const GenXSubtarget *Subtarget = nullptr; + GenXBaling *Baling = nullptr; + VISABuilder *CisaBuilder = nullptr; + +private: + void collectKernelInfo(); + void buildVariables(); + void buildInstructions(); + + bool buildInstruction(Instruction *Inst); + bool buildMainInst(Instruction *Inst, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildControlRegUpdate(unsigned Mask, bool Clear); + void buildJoin(CallInst *Join, BranchInst *Branch); + bool buildBranch(BranchInst *Branch); + void buildIntrinsic(CallInst *CI, unsigned IntrinID, genx::BaleInfo BI, + unsigned Mod, const DstOpndDesc &DstDesc); + void buildInputs(Function *F, bool NeedRetIP); + + void buildFunctionAddr(Instruction *Inst, const DstOpndDesc &DstDesc); + void buildLoneWrRegion(const DstOpndDesc &Desc); + void buildLoneWrPredRegion(Instruction *Inst, genx::BaleInfo BI); + void buildLoneOperand(Instruction *Inst, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + + VISA_PredVar *getPredicateVar(Register *Idx); + VISA_PredVar *getPredicateVar(Value *V); + VISA_PredVar *getZeroedPredicateVar(Value *V); + VISA_EMask_Ctrl getExecMaskFromWrPredRegion(Instruction *WrPredRegion, + bool IsNoMask); + VISA_EMask_Ctrl getExecMaskFromWrRegion(const DstOpndDesc &DstDesc, + bool IsNoMask = false); + unsigned getOrCreateLabel(Value *V, int Kind); + int getLabel(Value *V); + void setLabel(Value *V, unsigned Num); + + void emitOptimizationHints(); + + LoopInfoBase *getLoops(Function *F); + Value *getPredicateOperand(Instruction *Inst, unsigned OperandNum, + genx::BaleInfo BI, VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE 
&PredField, + VISA_EMask_Ctrl *MaskCtrl); + bool isInLoop(BasicBlock *BB); + + void addLabelInst(Value *BB); + void buildPhiNode(PHINode *Phi); + void buildGoto(CallInst *Goto, BranchInst *Branch); + void buildCall(IGCLLVM::CallInst *CI, const DstOpndDesc &DstDesc); + void buildStackCall(IGCLLVM::CallInst *CI, const DstOpndDesc &DstDesc); + void buildInlineAsm(CallInst *CI); + void buildPrintIndex(CallInst *CI, unsigned IntrinID, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildSelectInst(SelectInst *SI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildBinaryOperator(BinaryOperator *BO, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); +#if (LLVM_VERSION_MAJOR > 8) + void buildUnaryOperator(UnaryOperator *UO, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); +#endif + void buildBoolBinaryOperator(BinaryOperator *BO); + void buildSymbolInst(PtrToIntInst *ptr2Int, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildCastInst(CastInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildConvertAddr(CallInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildAlloca(CallInst *CI, unsigned IntrinID, unsigned Mod, + const DstOpndDesc &DstDesc); + void addWriteRegionLifetimeStartInst(Instruction *WrRegion); + void addLifetimeStartInst(Instruction *Inst); + void AddGenVar(Register &Reg); + void buildRet(ReturnInst *RI); + void buildBitCast(CastInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildCmp(CmpInst *Cmp, genx::BaleInfo BI, const DstOpndDesc &DstDesc); + void buildExtractRetv(ExtractValueInst *Inst); + void buildInsertRetv(InsertValueInst *Inst); + + VISA_VectorOpnd *createState(Register *Reg, unsigned Offset, bool IsDst); + VISA_Type getVISAImmTy(uint8_t ImmTy); + + VISA_PredOpnd *createPredOperand(VISA_PredVar *PredVar, + VISA_PREDICATE_STATE State, + VISA_PREDICATE_CONTROL Control); + + VISA_VectorOpnd *createCisaSrcOperand(VISA_GenVar *Decl, VISA_Modifier Mod, + unsigned VStride, unsigned Width, + unsigned HStride, unsigned ROffset, + unsigned COffset); + + VISA_VectorOpnd *createCisaDstOperand(VISA_GenVar *Decl, unsigned HStride, + unsigned ROffset, unsigned COffset); + + VISA_VectorOpnd *createDestination(Value *Dest, genx::Signedness Signed, + unsigned Mod, const DstOpndDesc &DstDesc, + genx::Signedness *SignedRes = nullptr, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createDestination(CisaVariable *Dest, + genx::Signedness Signed, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createDestination(Value *Dest, + genx::Signedness Signed, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSourceOperand(Instruction *Inst, + genx::Signedness Signed, + unsigned OperandNum, genx::BaleInfo BI, + unsigned Mod = 0, + genx::Signedness *SignedRes = nullptr, + unsigned MaxWidth = 16); + VISA_VectorOpnd *createSource(CisaVariable *V, genx::Signedness Signed, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSource(Value *V, genx::Signedness Signed, bool Baled, + unsigned Mod = 0, + genx::Signedness *SignedRes = nullptr, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSource(Value *V, genx::Signedness Signed, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + + std::string createInlineAsmOperand(Register *Reg, genx::Region *R, bool IsDst, + genx::Signedness Signed, + genx::ConstraintType Ty, unsigned Mod); + + std::string createInlineAsmSourceOperand(Value 
*V, genx::Signedness Signed, + bool Baled, genx::ConstraintType Ty, + unsigned Mod = 0, + unsigned MaxWidth = 16); + + std::string createInlineAsmDestinationOperand(Value *Dest, + genx::Signedness Signed, + genx::ConstraintType Ty, + unsigned Mod, + const DstOpndDesc &DstDesc); + + VISA_VectorOpnd *createImmediateOperand(Constant *V, genx::Signedness Signed); + + VISA_PredVar *createPredicateDeclFromSelect(Instruction *SI, + genx::BaleInfo BI, + VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE &PredField, + VISA_EMask_Ctrl *MaskCtrl); + + VISA_RawOpnd *createRawSourceOperand(Instruction *Inst, unsigned OperandNum, + genx::BaleInfo BI, + genx::Signedness Signed); + VISA_RawOpnd *createRawDestination(Value *V, const DstOpndDesc &DstDesc, + genx::Signedness Signed); + + VISA_VectorOpnd *createAddressOperand(Value *V, bool IsDst); + + void addDebugInfo(); + + void deduceRegion(Region *R, bool IsDest, unsigned MaxWidth = 16); + + VISA_VectorOpnd *createGeneralOperand(genx::Region *R, VISA_GenVar *Decl, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_VectorOpnd *createIndirectOperand(genx::Region *R, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_VectorOpnd *createRegionOperand(genx::Region *R, VISA_GenVar *Decl, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_PredOpnd *createPredFromWrRegion(const DstOpndDesc &DstDesc); + + VISA_PredOpnd *createPred(Instruction *Inst, genx::BaleInfo BI, + unsigned OperandNum); + + Instruction *getOriginalInstructionForSource(Instruction *CI, + genx::BaleInfo BI); + void buildConvert(CallInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + std::string buildAsmName() const; + void beginFunction(Function *Func); + void endFunction(Function *Func, ReturnInst *RI); + + unsigned getFuncArgsSize(Function *F); + unsigned getValueSize(Type *T, unsigned Mod = 32) const; + unsigned getValueSize(CisaVariable *V) const { + return V->getByteSize(); + } + unsigned getValueSize(Value *V, unsigned Mod = 32) const { + return getValueSize(V->getType(), Mod); + } + GenericCisaVariable *createCisaVariable(VISAKernel *Kernel, const char *Name, + VISA_GenVar *AliasVar, unsigned ByteSize); + + template + void emitVectorCopy( + T1 *Dst, T2 *Src, unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int TotalSize, bool DoCopy = true); + + void pushStackArg(VISA_StateOpndHandle *Dst, Value *Src, int TotalSz, + unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, bool DoCopy = true); + void popStackArg(Value *Dst, VISA_StateOpndHandle *Src, int TotalSz, + unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int &PrevStackOff); + +public: + GenXKernelBuilder(FunctionGroup *FG) + : TheKernelMetadata(FG->getHead()), Ctx(FG->getContext()), + DL(FG->getModule()->getDataLayout()), FG(FG) { + collectKernelInfo(); + } + ~GenXKernelBuilder() { clearLoops(); } + void clearLoops() { + for (auto i = Loops.begin(), e = Loops.end(); i != e; ++i) { + delete i->second; + i->second = nullptr; + } + Loops.clear(); + } + + bool run(std::string &KernelNameBuf); + + LLVMContext &getContext() { return Ctx; } + + unsigned addStringToPool(StringRef Str); + StringRef getStringByIndex(unsigned Val); +}; + +} // end namespace llvm + +char GenXCisaBuilder::ID = 0; +INITIALIZE_PASS_BEGIN(GenXCisaBuilder, "GenXCisaBuilderPass", + "GenXCisaBuilderPass", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXVisaRegAlloc) +INITIALIZE_PASS_DEPENDENCY(GenXModule) +INITIALIZE_PASS_END(GenXCisaBuilder, "GenXCisaBuilderPass", + "GenXCisaBuilderPass", false, false) + +FunctionGroupPass *llvm::createGenXCisaBuilderPass() { + initializeGenXCisaBuilderPass(*PassRegistry::getPassRegistry()); + return new GenXCisaBuilder(); +} + +void GenXCisaBuilder::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +bool GenXCisaBuilder::runOnFunctionGroup(FunctionGroup &FG) { + Ctx = &FG.getContext(); + std::unique_ptr KernelBuilder(new GenXKernelBuilder(&FG)); + KernelBuilder->FGA = getAnalysisIfAvailable(); + KernelBuilder->GM = getAnalysisIfAvailable(); + KernelBuilder->CisaBuilder = KernelBuilder->GM->GetCisaBuilder(); + KernelBuilder->RegAlloc = getAnalysisIfAvailable(); + KernelBuilder->Baling = &getAnalysis(); + KernelBuilder->DTs = &getAnalysis(); + KernelBuilder->Liveness = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + KernelBuilder->Subtarget = P ? P->getSubtarget() : nullptr; + + std::string KernelName; + KernelBuilder->run(KernelName); + + GenXModule *GM = KernelBuilder->GM; + VISABuilder *VisaBuilder = GM->GetCisaBuilder(); + if (GM->HasInlineAsm()) { + CISA_CALL(KernelBuilder->CisaBuilder->WriteVISAHeader()); + auto VISAAsmTextReader = GM->GetVISAAsmReader(); + auto VISATextHeader = + KernelBuilder->CisaBuilder->GetAsmTextHeaderStream().str(); + auto VISAText = KernelBuilder->CisaBuilder->GetAsmTextStream().str(); + CISA_CALL(VISAAsmTextReader->ParseVISAText(VISATextHeader, VISAText, "")); + VisaBuilder = VISAAsmTextReader; + } + for (auto &F : FG) { + if (genx::isKernel(F)) { + VISAKernel *BuiltKernel = VisaBuilder->GetVISAKernel(KernelName.c_str()); + GM->saveVisaKernel(F, BuiltKernel); + } else if (F->hasFnAttribute(genx::FunctionMD::CMStackCall)) { + VISAKernel *BuiltKernel = VisaBuilder->GetVISAKernel(F->getName()); + GM->saveVisaKernel(F, BuiltKernel); + } + } + + return false; +} + +static bool isDerivedFromUndef(Constant *C) { + if (isa(C)) + return true; + if (!isa(C)) + return false; + ConstantExpr *CE = cast(C); + for (auto &Opnd : CE->operands()) + if (isDerivedFromUndef(cast(Opnd))) + return true; + return false; +} + +static unsigned get8bitPackedFloat(float f) { + union { + float f; + unsigned u; + } u; + + u.f = f; + unsigned char Sign = (u.u >> 31) << 7; + unsigned Exp = (u.u >> 23) & 0xFF; + unsigned Frac = u.u & 0x7FFFFF; + if (Exp == 0 && Frac == 0) + return Sign; + + assert(Exp >= 124 && Exp <= 131); + Exp -= 124; + assert((Frac & 0x780000) == Frac); + Frac >>= 19; + assert(!(Exp == 124 && Frac == 0)); + + Sign |= (Exp << 4); + Sign |= Frac; + + return Sign; +} + +static Signedness getISatSrcSign(unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_ustrunc_sat: + return SIGNED; + case GenXIntrinsic::genx_sutrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + return UNSIGNED; + default: + return DONTCARESIGNED; + } +} + +static Signedness getISatDstSign(unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_sutrunc_sat: + return SIGNED; + case GenXIntrinsic::genx_ustrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + return UNSIGNED; + default: + return DONTCARESIGNED; + 
} +} + +static Signedness getISatSrcSign(Value *V) { + return getISatSrcSign(GenXIntrinsic::getGenXIntrinsicID(V)); +} + +static Signedness getISatDstSign(Value *V) { + return getISatDstSign(GenXIntrinsic::getGenXIntrinsicID(V)); +} + +void addKernelAttrsFromMetadata(VISAKernel &Kernel, const KernelMetadata &KM, + const GenXSubtarget* Subtarget) { + unsigned Val = KM.getSLMSize(); + if (Val) { + // Compute the slm size in KB and roundup to power of 2. + Val = alignTo(Val, 1024) / 1024; + if (!isPowerOf2_64(Val)) + Val = NextPowerOf2(Val); + unsigned MaxSLMSize = 64; + if (Val > MaxSLMSize) + report_fatal_error("slm size must not exceed 64KB"); + else { + // For pre-SKL, valid values are {0, 4, 8, 16, 32, 64}. + // For SKL+, valid values are {0, 1, 2, 4, 8, 16, 32, 64}. + // FIXME: remove the following line for SKL+. + Val = (Val < 4) ? 4 : Val; + uint8_t SLMSize = static_cast(Val); + Kernel.AddKernelAttribute("SLMSize", 1, &SLMSize); + } + } + +} + +// Legalize name for using as filename or in visa asm +static std::string legalizeName(std::string Name) { + std::replace_if(Name.begin(), Name.end(), + [](unsigned char c) { return (!isalnum(c) && c != '_'); }, + '_'); + return Name; +} + +std::string GenXKernelBuilder::buildAsmName() const { + assert(TheKernelMetadata.isKernel()); + std::string AsmName; + auto UserAsmName = AsmNameOpt.getValue(); + if (UserAsmName.empty()) { + AsmName = legalizeName(TheKernelMetadata.getName()); + } else { + int idx = -1; + auto *KernelMDs = + FG->getModule()->getOrInsertNamedMetadata(genx::FunctionMD::GenXKernels); + unsigned E = KernelMDs->getNumOperands(); + for (unsigned I = 0; I < E; ++I) { + MDNode *KernelMD = KernelMDs->getOperand(I); + StringRef KernelName = + cast(KernelMD->getOperand(genx::KernelMDOp::Name).get()) + ->getString(); + if (KernelName == TheKernelMetadata.getName()) { + idx = I; + break; + } + } + assert(idx >= 0); + // Reverse kernel ASM names during codegen. + // This provides an option to match the old compiler's output. + if (ReverseKernels.getValue()) + idx = E - idx - 1; + AsmName = (UserAsmName + llvm::Twine('_') + llvm::Twine(idx)).str(); + } + return AsmName; +} + + +bool GenXKernelBuilder::run(std::string &KernelNameBuf) { + GrfByteSize = Subtarget ? Subtarget->getGRFWidth() : 32; + StackSurf = Subtarget ? Subtarget->stackSurface() : PREDEFINED_SURFACE_STACK; + StringRef Name = TheKernelMetadata.getName(); + if (!Name.size()) { + // If it is not a kernel, or no metadata was found, then set the + // name to the IR name. + Name = FG->getHead()->getName(); + } + + // Cut kernel name to fit vISA name size + auto Size = (Name.size() > COMMON_ISA_MAX_FILENAME_LENGTH) + ? (COMMON_ISA_MAX_FILENAME_LENGTH) + : Name.size(); + KernelNameBuf.insert(0, Name.begin(), Size); + KernelNameBuf[Size] = 0; + if (TheKernelMetadata.isKernel()) { + CisaBuilder->AddKernel(MainKernel, KernelNameBuf.c_str()); + Kernel = static_cast(MainKernel); + Func2Kern[FG->getHead()] = Kernel; + } else { + CisaBuilder->AddFunction(Kernel, KernelNameBuf.c_str()); + } + + assert(Kernel && "Kernel initialization failed!"); + LLVM_DEBUG(dbgs() << "=== PROCESS KERNEL(" << TheKernelMetadata.getName() + << ") ===\n"); + + assert(Subtarget); + addKernelAttrsFromMetadata(*Kernel, TheKernelMetadata, Subtarget); + + bool NeedRetIP = false; // Need special return IP variable for FC. + if (TheKernelMetadata.isKernel()) { + // For a kernel, add an attribute for asm filename for the jitter. 
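+    // For example (illustrative): with -asm-name=foo and three kernels in the
+    // module, buildAsmName() above yields foo_0, foo_1, foo_2 in metadata
+    // order, or foo_2, foo_1, foo_0 under -reverse-kernels; that string is
+    // what gets attached as OutputAsmPath just below.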
+ std::string AsmName = buildAsmName(); + StringRef AsmNameRef = AsmName; + CISA_CALL(Kernel->AddKernelAttribute("OutputAsmPath", AsmNameRef.size(), + AsmNameRef.begin())); + + // Populate variable attributes if any. + unsigned Idx = 0; + bool IsComposable = false; + for (auto &Arg : FG->getHead()->args()) { + const char *Kind = nullptr; + switch (TheKernelMetadata.getArgInputOutputKind(Idx++)) { + default: + break; + case KernelMetadata::IO_INPUT: + Kind = "Input"; + break; + case KernelMetadata::IO_OUTPUT: + Kind = "Output"; + break; + case KernelMetadata::IO_INPUT_OUTPUT: + Kind = "Input_Output"; + break; + } + if (Kind != nullptr) { + auto R = RegAlloc->getRegForValueUntyped(FG->getHead(), &Arg); + assert(R && R->Category == RegCategory::GENERAL); + R->addAttribute(addStringToPool(Kind), ""); + IsComposable = true; + } + } + if (IsComposable) + CISA_CALL(Kernel->AddKernelAttribute("Composable", 0, "")); + if (HasCallable) { + CISA_CALL(Kernel->AddKernelAttribute("Caller", 0, "")); + NeedRetIP = true; + } + if (FG->getHead()->hasFnAttribute("CMCallable")) { + CISA_CALL(Kernel->AddKernelAttribute("Callable", 0, "")); + NeedRetIP = true; + } + if (FG->getHead()->hasFnAttribute("CMEntry")) { + CISA_CALL(Kernel->AddKernelAttribute("Entry", 0, "")); + } + } + + if (NeedRetIP) { + // Ask RegAlloc to add a special variable RetIP. + RegAlloc->addRetIPArgument(); + auto R = RegAlloc->getRetIPArgument(); + R->NameStr = "RetIP"; + R->addAttribute(addStringToPool("Input_Output"), ""); + } + + // Emit optimization hints if any. + emitOptimizationHints(); + + Func = FG->getHead(); + // Build variables + buildVariables(); + + // Build input variables + buildInputs(FG->getHead(), NeedRetIP); + + for (auto &F : *FG) { + Func = F; + if (F->hasFnAttribute(genx::FunctionMD::CMStackCall) || + F->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + VISAFunction *stackFunc = nullptr; + CisaBuilder->AddFunction((VISAFunction *&)stackFunc, F->getName().data()); + assert(stackFunc); + Func2Kern[F] = stackFunc; + Kernel = stackFunc; + buildVariables(); + Kernel = static_cast(MainKernel); + } + } + + // Build instructions + buildInstructions(); + + // Reset Regalloc hook + RegAlloc->SetRegPushHook(nullptr, nullptr); + + if (TheKernelMetadata.isKernel()) { + // For a kernel with no barrier instruction, add a NoBarrier attribute. + if (!HasBarrier) + CISA_CALL(Kernel->AddKernelAttribute("NoBarrier", 0, nullptr)); + } + + return false; +} + +static bool PatchImpArgOffset(Function *F, const GenXSubtarget *ST, + const KernelMetadata &KM) { + return false; +} + +void GenXKernelBuilder::buildInputs(Function *F, bool NeedRetIP) { + + assert(F->arg_size() == TheKernelMetadata.getNumArgs() && + "Mismatch between metadata for kernel and number of args"); + + // Number of globals to be binded statically. + std::vector> Bindings; + Module *M = F->getParent(); + for (auto &GV : M->getGlobalList()) { + int32_t Offset = 0; + GV.getAttribute(genx::FunctionMD::GenXByteOffset) + .getValueAsString() + .getAsInteger(0, Offset); + if (Offset > 0) + Bindings.emplace_back(&GV, Offset); + } + // Each argument. 
+ unsigned Idx = 0; + bool PatchImpArgOff = PatchImpArgOffset(F, Subtarget, TheKernelMetadata); + for (auto i = F->arg_begin(), e = F->arg_end(); i != e; ++i, ++Idx) { + if (TheKernelMetadata.shouldSkipArg(Idx)) + continue; + Argument *Arg = &*i; + Register *Reg = RegAlloc->getRegForValueUntyped(F, Arg); + assert(Reg); + uint8_t Kind = TheKernelMetadata.getArgKind(Idx); + uint16_t Offset; + if (!PatchImpArgOff) { + Offset = TheKernelMetadata.getArgOffset(Idx); + } + // Argument size in bytes. + auto &DL = F->getParent()->getDataLayout(); + Type *Ty = Arg->getType(); + uint16_t NumBytes = Ty->isPointerTy() ? DL.getPointerTypeSize(Ty) + : (Ty->getPrimitiveSizeInBits() / 8U); + + switch (Kind & 0x7) { + case visa::VISA_INPUT_GENERAL: + case visa::VISA_INPUT_SAMPLER: + case visa::VISA_INPUT_SURFACE: + CISA_CALL(Kernel->CreateVISAImplicitInputVar( + Reg->GetVar(Kernel), Offset, NumBytes, Kind >> 3)); + break; + + default: + report_fatal_error("Unknown input category"); + break; + } + } + // Add the special RetIP argument. + if (NeedRetIP) { + Register *Reg = RegAlloc->getRetIPArgument(); + uint16_t Offset = (127 * GrfByteSize + 6 * 4); // r127.6 + uint16_t NumBytes = (64 / 8); + CISA_CALL(Kernel->CreateVISAImplicitInputVar(Reg->GetVar(Kernel), + Offset, NumBytes, 0)); + } + // Add pseudo-input for global variables with offset attribute. + for (auto &Item : Bindings) { + // TODO: sanity check. No overlap with other inputs. + GlobalVariable *GV = Item.first; + uint16_t Offset = Item.second; + assert(Offset > 0); + uint16_t NumBytes = (GV->getValueType()->getPrimitiveSizeInBits() / 8U); + uint8_t Kind = KernelMetadata::IMP_PSEUDO_INPUT; + Register *Reg = RegAlloc->getRegForValueUntyped(F, GV); + CISA_CALL(Kernel->CreateVISAImplicitInputVar(Reg->GetVar(Kernel), + Offset, NumBytes, Kind >> 3)); + } +} + +// FIXME: We should use NM by default once code quality issues are addressed +// in vISA compiler. +static bool setNoMaskByDefault(Function *F) { + for (auto &BB : F->getBasicBlockList()) + if (GotoJoin::isGotoBlock(&BB)) + return true; + + // Check if this is subroutine call. + for (auto U : F->users()) { + if (auto CI = dyn_cast(U)) { + Function *G = CI->getParent()->getParent(); + if (G == F) + return false; + if (setNoMaskByDefault(G)) + return true; + } + } + + return false; +} + +void GenXKernelBuilder::buildInstructions() { + for (auto It = FG->begin(), E = FG->end(); It != E; ++It) { + Func = *It; + LLVM_DEBUG(dbgs() << "Building IR for func " << Func->getName().data() + << "\n"); + NoMask = setNoMaskByDefault(Func); + + if (Func->hasFnAttribute(genx::FunctionMD::CMGenXMain) || + Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + KernFunc = Func; + } else { + KernFunc = FGA->getSubGroup(Func) ? FGA->getSubGroup(Func)->getHead() + : FGA->getGroup(Func)->getHead(); + } + assert(KernFunc); + Kernel = Func2Kern.at(KernFunc); + + unsigned LabelID = getOrCreateLabel(Func, LABEL_SUBROUTINE); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LabelID])); + + beginFunction(Func); + + // If a float control is specified, emit code to make that happen. 
+ // Float control contains rounding mode, denorm behaviour and single + // precision float mode (ALT or IEEE) Relevant bits are already set as + // defined for VISA control reg in header definition on enums + if (Func->hasFnAttribute(genx::FunctionMD::CMFloatControl)) { + uint32_t FloatControl = 0; + Func->getFnAttribute(genx::FunctionMD::CMFloatControl) + .getValueAsString() + .getAsInteger(0, FloatControl); + + // Clear current float control bits to known zero state + buildControlRegUpdate(CR_Mask, true); + + // Set rounding mode to required state if that isn't zero + FloatControl &= CR_Mask; + if (FloatControl) { + if (FG->getHead() == Func) + DefaultFloatControl = FloatControl; + buildControlRegUpdate(FloatControl, false); + } + } + + // Only output a label for the initial basic block if it is used from + // somewhere else. + bool NeedsLabel = !Func->front().use_empty(); + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; + ++fi) { + BasicBlock *BB = &*fi; + if (!NeedsLabel && BB != &Func->front()) { + NeedsLabel = !BB->getSinglePredecessor(); + if (!NeedsLabel) + NeedsLabel = GotoJoin::isJoinLabel(BB); + } + if (NeedsLabel) { + unsigned LabelID = getOrCreateLabel(BB, LABEL_BLOCK); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LabelID])); + } + NeedsLabel = true; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; + ++bi) { + Instruction *Inst = &*bi; + if (Inst->isTerminator()) { + // Before the terminator inst of a basic block, if there is a single + // successor and it is the header of a loop, for any vector of at + // least four GRFs with a phi node where our incoming value is + // undef, insert a lifetime.start here. + auto TI = cast(Inst); + if (TI->getNumSuccessors() == 1) { + auto Succ = TI->getSuccessor(0); + if (getLoops(Succ->getParent())->isLoopHeader(Succ)) { + for (auto si = Succ->begin();; ++si) { + auto Phi = dyn_cast(&*si); + if (!Phi) + break; + if (Phi->getType()->getPrimitiveSizeInBits() >= + (GrfByteSize * 8) * 4 && + isa( + Phi->getIncomingValue(Phi->getBasicBlockIndex(BB)))) + addLifetimeStartInst(Phi); + } + } + } + } + + // Build the instruction. + if (!Baling->isBaled(Inst)) { +#if DUMP_VISA_INTSTRUCTIONS + errs() << *Inst << '\n'; + auto CisaInstCount = Kernel->getvIsaInstCount(); +#endif + if (ReturnInst *RI = dyn_cast(Inst)) + endFunction(Func, RI); + if (buildInstruction(Inst)) + NeedsLabel = false; +#if DUMP_VISA_INTSTRUCTIONS + VISAKernelImpl *KernelImpl = (VISAKernelImpl *)Kernel; + if (CisaInstCount != Kernel->getvIsaInstCount()) { + VISAKernel_format_provider fmt(KernelImpl); + auto It = KernelImpl->getInstructionListBegin(), + ItEnd = KernelImpl->getInstructionListEnd(); + for (int Idx = 0; It != ItEnd; ++It, ++Idx) { + if (Idx >= CisaInstCount + 1) { + errs() << printInstruction(&fmt, (*It)->getCISAInst(), + KernelImpl->getOptions()) + << "\n\n"; + } + } + } +#endif + } + } + } + } +} + +bool GenXKernelBuilder::buildInstruction(Instruction *Inst) { + // Make the source location pending, so it is output as vISA FILE and LOC + // instructions next time an opcode is written. + const DebugLoc &DL = Inst->getDebugLoc(); + if (DL) { + StringRef Filename = DL->getFilename(); + if (Filename != "") { + PendingFilename = Filename; + PendingDirectory = DL->getDirectory(); + } + PendingLine = DL.getLine(); + } + // Process the bale that this is the head instruction of. + BaleInfo BI = Baling->getBaleInfo(Inst); + + DstOpndDesc DstDesc; + if (BI.Type == BaleInfo::GSTORE) { + // Inst is a global variable store. 
It should be baled into a wrr + // instruction. + Bale B; + Baling->buildBale(Inst, &B); + // This is an identity bale; no code will be emitted. + if (isIdentityBale(B)) + return false; + + assert(BI.isOperandBaled(0)); + DstDesc.GStore = Inst; + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } + + if (BI.Type == BaleInfo::WRREGION || BI.Type == BaleInfo::WRPREDREGION || + BI.Type == BaleInfo::WRPREDPREDREGION) { + // Inst is a wrregion or wrpredregion or wrpredpredregion. + DstDesc.WrRegion = Inst; + DstDesc.WrRegionBI = BI; + if (isa(Inst->getOperand(0)) && !DstDesc.GStore) { + // This is a wrregion, probably a partial write, to an undef value. + // Write a lifetime start if appropriate to help the jitter's register + // allocator. + addWriteRegionLifetimeStartInst(DstDesc.WrRegion); + } + // See if it bales in the instruction + // that generates the subregion/element. That is always operand 1. + enum { OperandNum = 1 }; + if (!BI.isOperandBaled(OperandNum)) { + if (BI.Type == BaleInfo::WRPREDREGION) { + buildLoneWrPredRegion(DstDesc.WrRegion, DstDesc.WrRegionBI); + } else { + buildLoneWrRegion(DstDesc); + } + return false; + } + // Yes, source of wrregion is baled in. + Inst = cast(DstDesc.WrRegion->getOperand(OperandNum)); + BI = Baling->getBaleInfo(Inst); + } + if (BI.Type == BaleInfo::FADDR) { + buildFunctionAddr(Inst, DstDesc); + return false; + } + unsigned Mod = 0; + if (BI.Type == BaleInfo::SATURATE) { + // Inst is a fp saturate. See if it bales in the instruction that + // generates the value to saturate. That is always operand 0. If + // not, just treat the saturate as a normal intrinsic. + if (BI.isOperandBaled(0)) { + Mod = MODIFIER_SAT; + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } else + BI.Type = BaleInfo::MAININST; + } + if (BI.Type == BaleInfo::CMPDST) { + // Dst of sel instruction is baled in. + Inst = cast(Inst->getOperand(0)); + assert(isa(Inst) && "Only bale sel into a cmp instruction"); + BI = Baling->getBaleInfo(Inst); + } + switch (BI.Type) { + case BaleInfo::RDREGION: + case BaleInfo::ABSMOD: + case BaleInfo::NEGMOD: + case BaleInfo::NOTMOD: + // This is a rdregion or modifier not baled in to a main instruction + // (but possibly baled in to a wrregion or sat modifier). + buildLoneOperand(Inst, BI, Mod, DstDesc); + return false; + } + assert(BI.Type == BaleInfo::MAININST || BI.Type == BaleInfo::NOTP || + BI.Type == BaleInfo::ZEXT || BI.Type == BaleInfo::SEXT); + return buildMainInst(Inst, BI, Mod, DstDesc); +} + +VISA_PredVar *GenXKernelBuilder::createPredicateDeclFromSelect( + Instruction *SI, BaleInfo BI, VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE &State, VISA_EMask_Ctrl *MaskCtrl) { + *MaskCtrl = vISA_EMASK_M1_NM; + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting State and MaskCtrl + // appropriately. + Value *Mask = getPredicateOperand(SI, 0 /*selector operand in select*/, BI, + Control, State, MaskCtrl); + assert(!isa(Mask)); + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. 
+ Register *Reg = RegAlloc->getRegForValue(KernFunc, Mask); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + if (NoMask) + *MaskCtrl |= vISA_EMASK_M1_NM; + return getPredicateVar(Reg); +} + +VISA_PredOpnd * +GenXKernelBuilder::createPredFromWrRegion(const DstOpndDesc &DstDesc) { + VISA_PredOpnd *result = nullptr; + Instruction *WrRegion = DstDesc.WrRegion; + if (WrRegion) { + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + Value *Mask = + getPredicateOperand(WrRegion, 7 /*mask operand in wrregion*/, + DstDesc.WrRegionBI, Control, State, &MaskCtrl); + if (auto C = dyn_cast(Mask)) { + (void)C; + assert(C->isAllOnesValue() && "wrregion mask or predication operand must " + "be constant 1 or not constant"); + } else { + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. If the predicate has + // not has a register allocated, it must be EM. + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, Mask); + if (Reg) { + assert(Reg->Category == RegCategory::PREDICATE); + result = createPredOperand(getPredicateVar(Reg), State, Control); + } + } + } + return result; +} + +/*********************************************************************** + * createPred : create predication field from an instruction operand + * + * Enter: Inst = the instruction (0 to write an "always true" pred field) + * BI = BaleInfo for the instruction, so we can see if there is a + * rdpredregion baled in to the mask + * OperandNum = operand number in the instruction + * + * If the operand is not constant 1, then it must be a predicate register. + */ +VISA_PredOpnd *GenXKernelBuilder::createPred(Instruction *Inst, BaleInfo BI, + unsigned OperandNum) { + VISA_PredOpnd *ResultOperand = nullptr; + VISA_PREDICATE_CONTROL PredControl; + VISA_PREDICATE_STATE Inverse; + VISA_EMask_Ctrl MaskCtrl; + Value *Mask = getPredicateOperand(Inst, OperandNum, BI, PredControl, Inverse, + &MaskCtrl); + if (auto C = dyn_cast(Mask)) { + (void)C; + assert(C->isAllOnesValue() && "wrregion mask or predication operand must " + "be constant 1 or not constant"); + } else { + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. If the predicate has not + // has a register allocated, it must be EM. 
+ Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, Mask); + VISA_PredVar *PredVar = nullptr; + if (Reg) { + assert(Reg->Category == RegCategory::PREDICATE); + PredVar = getPredicateVar(Reg); + } else + return nullptr; + ResultOperand = createPredOperand(PredVar, Inverse, PredControl); + } + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createState(Register *Reg, unsigned Offset, + bool IsDst) { + uint8_t Size = 0; + VISA_VectorOpnd *Op = nullptr; + + switch (Reg->Category) { + case RegCategory::SURFACE: + CISA_CALL(Kernel->CreateVISAStateOperand(Op, Reg->GetVar(Kernel), + Size, Offset, IsDst)); + break; + case RegCategory::SAMPLER: + CISA_CALL(Kernel->CreateVISAStateOperand(Op, Reg->GetVar(Kernel), + Size, Offset, IsDst)); + break; + default: + llvm_unreachable("unknown state operand"); + } + + return Op; +} + +VISA_VectorOpnd *GenXKernelBuilder::createDestination(CisaVariable *Dest, + genx::Signedness Signed, + unsigned *Offset) { + Region R(VectorType::get( + IntegerType::get(Ctx, CISATypeTable[Dest->getType()].typeSize * CHAR_BIT), + Dest->getNumElements())); + if (Offset) + R.Offset = *Offset; + return createRegionOperand(&R, Dest->getGenVar(), Signed, 0, true); +} + +VISA_VectorOpnd *GenXKernelBuilder::createDestination(Value *Dest, + genx::Signedness Signed, + unsigned *Offset) { + return createDestination(Dest, Signed, 0, DstOpndDesc(), nullptr, Offset); +} + +VISA_VectorOpnd * +GenXKernelBuilder::createDestination(Value *Dest, genx::Signedness Signed, + unsigned Mod, const DstOpndDesc &DstDesc, + Signedness *SignedRes, unsigned *Offset) { + assert(!Dest->getType()->isAggregateType() && + "cannot create destination register of an aggregate type"); + if (SignedRes) + *SignedRes = Signed; + + Type *OverrideType = nullptr; + if (BitCastInst *BCI = dyn_cast(Dest)) { + if (!(isa(BCI->getOperand(0))) && + !(BCI->getType()->getScalarType()->isIntegerTy(1)) && + (BCI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1))) { + if (VectorType *VT = dyn_cast(Dest->getType())) { + unsigned int NumBits = VT->getNumElements() * + VT->getElementType()->getPrimitiveSizeInBits(); + OverrideType = IntegerType::get(BCI->getContext(), NumBits); + } + } + } + + // Saturation can also change signedness. + if (!Dest->user_empty() && GenXIntrinsic::isIntegerSat(Dest->user_back())) { + Signed = getISatDstSign(Dest->user_back()); + } + + if (!DstDesc.WrRegion) { + if (Mod) { + // There is a sat modifier. Either it is an fp saturate, which is + // represented by its own intrinsic which this instruction is baled + // into, or it is an int saturate which always comes from this + // instruction's semantics. In the former case, use the value + // that is the result of the saturate. But only if this instruction + // itself is not the sat intrinsic. + if (Dest->getType()->getScalarType()->isFloatingPointTy() && + GenXIntrinsic::getGenXIntrinsicID(Dest) != GenXIntrinsic::genx_sat) + Dest = cast(Dest->use_begin()->getUser()); + } + if ((Mod & MODIFIER_SAT) != 0) { + // Similar for integer saturation. 
+ if (Dest->getType()->getScalarType()->isIntegerTy() && + !GenXIntrinsic::isIntegerSat(Dest) && GenXIntrinsic::isIntegerSat(Dest->user_back())) + Dest = cast(Dest->user_back()); + } + Register *Reg = RegAlloc->getRegForValue(KernFunc, Dest, Signed, OverrideType); + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + // Write the vISA general operand: + if (Reg->Category == RegCategory::GENERAL) { + Region DestR(Dest); + if (Offset) + DestR.Offset = *Offset; + return createRegionOperand(&DestR, Reg->GetVar(Kernel), + DONTCARESIGNED, Mod, true /*isDest*/); + } else { + assert(Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME || + Reg->Category == RegCategory::SAMPLER); + + return createState(Reg, 0 /*Offset*/, true /*IsDst*/); + } + } + // We need to allow for the case that there is no register allocated if it + // is an indirected arg, and that is OK because the region is indirect so + // the vISA does not contain the base register. + Register *Reg; + + Value *V = nullptr; + if (DstDesc.GStore) { + auto GV = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(GV && "out of sync"); + if (OverrideType == nullptr) + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + Reg = RegAlloc->getRegForValue(KernFunc, GV, Signed, OverrideType); + V = GV; + } else { + V = DstDesc.WrRegion; + Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + } + + assert(!Reg || Reg->Category == RegCategory::GENERAL || + Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME); + + // Write the vISA general operand with region: + Region R(DstDesc.WrRegion, DstDesc.WrRegionBI); + + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + + if (Reg && (Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME)) { + return createState(Reg, R.Offset / R.ElementBytes, true /*IsDest*/); + } else { + auto Decl = Reg ? 
Reg->GetVar(Kernel) : nullptr; + return createRegionOperand(&R, Decl, Signed, Mod, true /*IsDest*/); + } +} + +VISA_VectorOpnd *GenXKernelBuilder::createSourceOperand( + Instruction *Inst, Signedness Signed, unsigned OperandNum, + genx::BaleInfo BI, unsigned Mod, Signedness *SignedRes, unsigned MaxWidth) { + Value *V = Inst->getOperand(OperandNum); + return createSource(V, Signed, BI.isOperandBaled(OperandNum), Mod, SignedRes, + MaxWidth); +} + +VISA_PredOpnd * +GenXKernelBuilder::createPredOperand(VISA_PredVar *PredVar, + VISA_PREDICATE_STATE State, + VISA_PREDICATE_CONTROL Control) { + VISA_PredOpnd *PredOperand = nullptr; + CISA_CALL( + Kernel->CreateVISAPredicateOperand(PredOperand, PredVar, State, Control)); + + return PredOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createCisaSrcOperand( + VISA_GenVar *Decl, VISA_Modifier Mod, unsigned VStride, unsigned Width, + unsigned HStride, unsigned ROffset, unsigned COffset) { + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(ResultOperand, Decl, Mod, VStride, + Width, HStride, ROffset, COffset)); + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createCisaDstOperand(VISA_GenVar *Decl, + unsigned HStride, + unsigned ROffset, + unsigned COffset) { + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(ResultOperand, Decl, HStride, ROffset, + COffset)); + return ResultOperand; +} + +/*********************************************************************** + * createAddressOperand : create an address register operand + */ +VISA_VectorOpnd *GenXKernelBuilder::createAddressOperand(Value *V, bool IsDst) { + VISA_VectorOpnd *ResultOperand = nullptr; + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg->Category == RegCategory::ADDRESS); + unsigned Width = 1; + if (VectorType *VT = dyn_cast(V->getType())) + Width = VT->getNumElements(); + if (IsDst) { + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + ResultOperand, Reg->GetVar(Kernel), 0)); + } else { + CISA_CALL(Kernel->CreateVISAAddressSrcOperand( + ResultOperand, Reg->GetVar(Kernel), 0, Width)); + } + return ResultOperand; +} + +VISA_Type GenXKernelBuilder::getVISAImmTy(uint8_t ImmTy) { + return static_cast(ImmTy & 0xf); +} + +VISA_VectorOpnd *GenXKernelBuilder::createImmediateOperand(Constant *V, + Signedness Signed) { + if (isDerivedFromUndef(V)) + V = Constant::getNullValue(V->getType()); + + Type *T = V->getType(); + if (VectorType *VT = dyn_cast(T)) { + // Vector constant. + auto Splat = V->getSplatValue(); + if (!Splat) { + // Non-splatted vector constant. Must be a packed vector. + unsigned NumElements = VT->getNumElements(); + if (VT->getElementType()->isIntegerTy()) { + // Packed int vector. + assert(NumElements <= ImmIntVec::Width); + unsigned Packed = 0; + for (unsigned i = 0; i != NumElements; ++i) { + auto El = dyn_cast(V->getAggregateElement(i)); + if (!El) + continue; // undef element + int This = El->getSExtValue(); + if (This < ImmIntVec::MinUInt) { + assert(This >= ImmIntVec::MinSInt && + "too big imm, cannot encode as vector imm"); + Signed = SIGNED; + } else if (This > ImmIntVec::MaxSInt) { + assert(This <= ImmIntVec::MaxUInt && + "too big imm, cannot encode as vector imm"); + Signed = UNSIGNED; + } + Packed |= (This & ImmIntVec::MaxUInt) << (ImmIntVec::ElemSize * i); + } + // For a 2- or 4-wide operand, we need to repeat the vector elements + // as which ones are used depends on the position of the other + // operand in its oword. 
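+        // Worked example (assuming 4-bit packed elements, which is what the
+        // ImmIntVec constants and the multipliers below imply): a 2-wide
+        // vector {1, 2} packs to Packed = 0x21; multiplying by 0x01010101
+        // replicates it to 0x21212121, i.e. lanes 1,2,1,2,1,2,1,2, so either
+        // position within the oword reads the correct pair.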
+ switch (NumElements) { + case 2: + Packed = Packed * 0x01010101; + break; + case 4: + Packed = Packed * 0x00010001; + break; + } + auto ImmTy = + static_cast(Signed == UNSIGNED ? ISA_TYPE_UV : ISA_TYPE_V); + auto VISAImmTy = getVISAImmTy(ImmTy); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Packed, VISAImmTy)); + return ImmOp; + } + // Packed float vector. + assert(VT->getElementType()->isFloatTy() && + (NumElements == 1 || NumElements == 2 || NumElements == 4)); + unsigned Packed = 0; + for (unsigned i = 0; i != 4; ++i) { + auto CFP = + dyn_cast(V->getAggregateElement(i % NumElements)); + if (!CFP) // Undef + continue; + const APFloat &FP = CFP->getValueAPF(); + Packed |= get8bitPackedFloat(FP.convertToFloat()) << (i * 8); + } + auto VISAImmTy = getVISAImmTy(ISA_TYPE_VF); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Packed, VISAImmTy)); + return ImmOp; + } + // Splatted (or single element) vector. Use the scalar value. + T = VT->getElementType(); + V = Splat; + } + + if (isDerivedFromUndef(V)) + V = Constant::getNullValue(V->getType()); + else if (isa(V)) { + const DataLayout &DL = Func->getParent()->getDataLayout(); + T = DL.getIntPtrType(V->getType()); + V = Constant::getNullValue(T); + } + + // We have a scalar constant. + if (IntegerType *IT = dyn_cast(T)) { + ConstantInt *CI = cast(V); + // I think we need to use the appropriate one of getZExtValue or + // getSExtValue to avoid an assert on very large 64 bit values... + int64_t Val = Signed == UNSIGNED ? CI->getZExtValue() : CI->getSExtValue(); + visa::TypeDetails TD(Func->getParent()->getDataLayout(), IT, Signed); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL( + Kernel->CreateVISAImmediate(ImmOp, &Val, getVISAImmTy(TD.VisaType))); + return ImmOp; + } if (isa(V)) { + assert(0 && "Not baled function address"); + return nullptr; + } else { + VISA_VectorOpnd *ImmOp = nullptr; + ConstantFP *CF = cast(V); + if (T->isFloatTy()) { + union { + float f; + uint32_t i; + } Val; + Val.f = CF->getValueAPF().convertToFloat(); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_F); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val.i, VISAImmTy)); + } else if (T->isHalfTy()) { + uint16_t Val( + (uint16_t)(CF->getValueAPF().bitcastToAPInt().getZExtValue())); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_HF); + auto Val32 = static_cast(Val); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val32, VISAImmTy)); + } else { + assert(T->isDoubleTy()); + union { + double f; + uint64_t i; + } Val; + Val.f = CF->getValueAPF().convertToDouble(); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_DF); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val.i, VISAImmTy)); + } + return ImmOp; + } +} + +/*********************************************************************** + * getOriginalInstructionForSource : trace a source operand back through + * its bale (if any), given a starting instruction. + * + * Enter: Inst = The instruction to start tracing from. 
+ * BI = BaleInfo for Inst + */ +Instruction * +GenXKernelBuilder::getOriginalInstructionForSource(Instruction *Inst, + BaleInfo BI) { + while (!isa(Inst->getOperand(0)) && BI.isOperandBaled(0)) { + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } + + return Inst; +} + +void GenXKernelBuilder::buildConvert(CallInst *CI, BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + Register *DstReg = RegAlloc->getRegForValue(KernFunc, CI, UNSIGNED); + if (!isa(CI->getOperand(0))) { + Instruction *OrigInst = getOriginalInstructionForSource(CI, BI); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, OrigInst->getOperand(0)); + (void)SrcReg; + assert((SrcReg->Category != RegCategory::GENERAL || + DstReg->Category != RegCategory::GENERAL) && + "expected a category conversion"); + } + + if (DstReg->Category != RegCategory::ADDRESS) { + // State copy. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) { + ExecSize = VT->getNumElements(); + } + + auto ISAExecSize = static_cast(genx::log2(ExecSize)); + auto Dst = createDestination(CI, UNSIGNED, 0, DstDesc); + auto Src = createSourceOperand(CI, UNSIGNED, 0, BI); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOVS, nullptr /*Pred*/, false /*Mod*/, + NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1, ISAExecSize, Dst, Src)); + return; + } + + // Destination is address register. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) { + report_fatal_error("vector of addresses not implemented"); + } + + auto ISAExecSize = static_cast(genx::log2(ExecSize)); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, CI->getOperand(0)); + assert(SrcReg->Category == RegCategory::ADDRESS); + + (void)SrcReg; + // This is an address->address copy, inserted due to coalescing failure of + // the address for an indirected arg in GenXArgIndirection. + // (A conversion to address is handled in buildConvertAddr below.) + // Write the addr_add instruction. 
+ Value *SrcOp0 = CI->getOperand(0); + unsigned Src0Width = 1; + if (VectorType *VT = dyn_cast(SrcOp0->getType())) + Src0Width = VT->getNumElements(); + + Register *RegDst = RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED); + Register *RegSrc0 = RegAlloc->getRegForValue(KernFunc, SrcOp0, DONTCARESIGNED); + + VISA_VectorOpnd *Dst = nullptr, *Src0 = nullptr, *Src1 = nullptr; + + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + Dst, RegDst->GetVar(Kernel), 0)); + CISA_CALL(Kernel->CreateVISAAddressSrcOperand( + Src0, RegSrc0->GetVar(Kernel), 0, Src0Width)); + Src1 = + createImmediateOperand(Constant::getNullValue(CI->getType()), UNSIGNED); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAAddrAddInst(vISA_EMASK_M1_NM, ISAExecSize, Dst, + Src0, Src1)); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(CisaVariable *V, + Signedness Signed, + unsigned MaxWidth, + unsigned *Offset) { + Region R(VectorType::get( + IntegerType::get(Ctx, CISATypeTable[V->getType()].typeSize * CHAR_BIT), + V->getNumElements())); + if (Offset) + R.Offset = *Offset; + return createRegionOperand(&R, V->getGenVar(), Signed, 0, false, MaxWidth); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(Value *V, Signedness Signed, + unsigned MaxWidth, + unsigned *Offset) { + return createSource(V, Signed, false, 0, nullptr, MaxWidth, Offset); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(Value *V, Signedness Signed, + bool Baled, unsigned Mod, + Signedness *SignedRes, + unsigned MaxWidth, + unsigned *Offset) { + if (SignedRes) + *SignedRes = Signed; + if (auto C = dyn_cast(V)) { + if (Mod) { + // Need to negate constant. + assert(Mod == MODIFIER_NEG && "unexpected modifier"); + if (C->getType()->isIntOrIntVectorTy()) + C = ConstantExpr::getNeg(C); + else + C = ConstantExpr::getFNeg(C); + } + return createImmediateOperand(C, Signed); + } + if (!Baled) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + assert(Reg->Category == RegCategory::GENERAL || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::VME); + // Write the vISA general operand. + Region R(V); + if (Offset) + R.Offset = *Offset; + if (R.NumElements == 1) + R.VStride = R.Stride = 0; + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + if (Reg->Category == RegCategory::GENERAL) { + return createRegionOperand(&R, Reg->GetVar(Kernel), Signed, Mod, + false /*IsDst*/, MaxWidth); + } else { + return createState(Reg, R.Offset >> 2, false /*IsDst*/); + }; + } + + Instruction *Inst = cast(V); + BaleInfo BI(Baling->getBaleInfo(Inst)); + unsigned Idx = 0; + switch (BI.Type) { + case BaleInfo::RDREGION: { + // The source operand has a rdregion baled in. We need to allow for the + // case that there is no register allocated if it is an indirected arg, + // and that is OK because the region is indirect so the vISA does not + // contain the base register. + Value *V = Inst->getOperand(0); + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed); + + // Ensure we pick a non-DONTCARESIGNED signedness here, as, for an + // indirect region and DONTCARESIGNED, writeRegion arbitrarily picks a + // signedness as it is attached to the operand, unlike a direct region + // where it is attached to the vISA register. + if (Reg) + Signed = RegAlloc->getSigned(Reg); + else if (Signed == DONTCARESIGNED) + Signed = SIGNED; + // Write the vISA general operand with region. 
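+    // The baled-in rdregion supplies the region parameters for this source;
+    // degenerate regions (one element, or width 1) get their strides zeroed
+    // below before the operand is created.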
+ Region R(Inst, Baling->getBaleInfo(Inst)); + if (Offset) + R.Offset = *Offset; + if (R.NumElements == 1) + R.VStride = 0; + if (R.Width == 1) + R.Stride = 0; + if (!Reg || Reg->Category == RegCategory::GENERAL || R.Indirect) { + if (SignedRes) + *SignedRes = Signed; + return createRegionOperand(&R, Reg ? Reg->GetVar(Kernel) : nullptr, + Signed, Mod, false, MaxWidth); + } else { + if (SignedRes) + *SignedRes = Signed; + return createState(Reg, R.Offset >> 2, false /*IsDst*/); + } + } + case BaleInfo::ABSMOD: + Signed = SIGNED; + Mod |= MODIFIER_ABS; + break; + case BaleInfo::NEGMOD: + if (!(Mod & MODIFIER_ABS)) + Mod ^= MODIFIER_NEG; + Idx = 1; // the input we want in "0-x" is x, not 0. + break; + case BaleInfo::NOTMOD: + Mod ^= MODIFIER_NOT; + break; + case BaleInfo::ZEXT: + Signed = UNSIGNED; + break; + case BaleInfo::SEXT: + Signed = SIGNED; + break; + default: + llvm_unreachable("unknown bale type"); + break; + } + return createSource(Inst->getOperand(Idx), Signed, BI.isOperandBaled(Idx), + Mod, SignedRes, MaxWidth); +} + +std::string GenXKernelBuilder::createInlineAsmOperand( + Register *Reg, genx::Region *R, bool IsDst, genx::Signedness Signed, + genx::ConstraintType Ty, unsigned Mod) { + deduceRegion(R, IsDst); + + VISA_VectorOpnd *ResultOperand = nullptr; + switch (Ty) { + default: + llvm_unreachable("constraint unhandled"); + case ConstraintType::Constraint_cr: { + assert(Reg && Reg->Category == RegCategory::PREDICATE); + VISA_PredVar *PredVar = getPredicateVar(Reg); + VISA_PredOpnd *PredOperand = + createPredOperand(PredVar, PredState_NO_INVERSE, PRED_CTRL_NON); + return Kernel->getPredicateOperandName(PredOperand); + } + case ConstraintType::Constraint_rw: + return Kernel->getVarName(Reg->GetVar(Kernel)); + case ConstraintType::Constraint_r: + ResultOperand = + createGeneralOperand(R, Reg->GetVar(Kernel), Signed, Mod, IsDst); + break; + case ConstraintType::Constraint_a: + if (!R->Indirect) + report_fatal_error("Inline asm operand can'be indirected here"); + ResultOperand = createIndirectOperand(R, Signed, Mod, IsDst); + break; + } + return Kernel->getVectorOperandName(ResultOperand, true); +} + +std::string GenXKernelBuilder::createInlineAsmDestinationOperand( + Value *Dest, genx::Signedness Signed, genx::ConstraintType Ty, unsigned Mod, + const DstOpndDesc &DstDesc) { + + Type *OverrideType = nullptr; + + // Saturation can also change signedness. + if (!Dest->user_empty() && GenXIntrinsic::isIntegerSat(Dest->user_back())) { + Signed = getISatDstSign(Dest->user_back()); + } + + if (!DstDesc.WrRegion) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, Dest, Signed, OverrideType); + + Region DestR(Dest); + return createInlineAsmOperand(Reg, &DestR, true /*IsDst*/, DONTCARESIGNED, + Ty, Mod); + } + // We need to allow for the case that there is no register allocated if it is + // an indirected arg, and that is OK because the region is indirect so the + // vISA does not contain the base register. 
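+  // The destination is either the global variable behind a baled-in g_store
+  // or the wrregion result itself; in both cases the operand is built with
+  // the wrregion's region parameters.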
+ Register *Reg; + + Value *V = nullptr; + if (DstDesc.GStore) { + auto GV = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(GV && "out of sync"); + if (OverrideType == nullptr) + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + Reg = RegAlloc->getRegForValue(KernFunc, GV, Signed, OverrideType); + V = GV; + } else { + V = DstDesc.WrRegion; + Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + } + + assert(!Reg || Reg->Category == RegCategory::GENERAL); + + // Write the vISA general operand with region: + Region R(DstDesc.WrRegion, DstDesc.WrRegionBI); + + return createInlineAsmOperand(Reg, &R, true /*IsDst*/, Signed, Ty, Mod); +} + +std::string GenXKernelBuilder::createInlineAsmSourceOperand( + Value *V, genx::Signedness Signed, bool Baled, genx::ConstraintType Ty, + unsigned Mod, unsigned MaxWidth) { + + if (auto C = dyn_cast(V)) { + if (Ty != genx::ConstraintType::Constraint_n) { + if (Mod) { + // Need to negate constant. + assert(Mod == MODIFIER_NEG && "unexpected modifier"); + if (C->getType()->isIntOrIntVectorTy()) + C = ConstantExpr::getNeg(C); + else + C = ConstantExpr::getFNeg(C); + } + VISA_VectorOpnd *ImmOp = createImmediateOperand(C, Signed); + return Kernel->getVectorOperandName(ImmOp, false); + } else { + ConstantInt *CI = cast(C); + return llvm::to_string(CI->getSExtValue()); + } + } + + if (!Baled) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + Region R(V); + if (R.NumElements == 1) + R.VStride = R.Stride = 0; + + return createInlineAsmOperand(Reg, &R, false /*IsDst*/, Signed, Ty, Mod); + } + + Instruction *Inst = cast(V); + BaleInfo BI(Baling->getBaleInfo(Inst)); + assert(BI.Type == BaleInfo::RDREGION); + // The source operand has a rdregion baled in. We need to allow for the + // case that there is no register allocated if it is an indirected arg, + // and that is OK because the region is indirect so the vISA does not + // contain the base register. + V = Inst->getOperand(0); + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + + // Ensure we pick a non-DONTCARESIGNED signedness here, as, for an + // indirect region and DONTCARESIGNED, writeRegion arbitrarily picks a + // signedness as it is attached to the operand, unlike a direct region + // where it is attached to the vISA register. + if (Signed == DONTCARESIGNED) + Signed = SIGNED; + // Write the vISA general operand with region. 
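+  // As with a regular source, the baled rdregion supplies the region
+  // parameters, and degenerate regions get their strides zeroed before the
+  // inline asm operand name is produced.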
+ Region R(Inst, Baling->getBaleInfo(Inst)); + if (R.NumElements == 1) + R.VStride = 0; + if (R.Width == 1) + R.Stride = 0; + + assert(Reg->Category == RegCategory::GENERAL || R.Indirect); + + return createInlineAsmOperand(Reg, &R, false /*IsDst*/, Signed, Ty, Mod); +} + +/*********************************************************************** + * getPredicateVar : get predicate var from value + */ +VISA_PredVar *GenXKernelBuilder::getPredicateVar(Value *V) { + auto Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + return getPredicateVar(Reg); +} + +/*********************************************************************** + * getZeroedPredicateVar : get predicate var from value with zeroing it + */ +VISA_PredVar *GenXKernelBuilder::getZeroedPredicateVar(Value *V) { + auto Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + auto PredVar = getPredicateVar(Reg); + unsigned Size = V->getType()->getPrimitiveSizeInBits(); + auto C = Constant::getNullValue(V->getType()); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, VISA_Exec_Size(genx::log2(Size)), + PredVar, createImmediateOperand(C, DONTCARESIGNED))); + + return PredVar; +} + +/*********************************************************************** + * getPredicateVar : get predicate var from register + */ +VISA_PredVar *GenXKernelBuilder::getPredicateVar(Register *R) { + assert(R); + return R->Num >= visa::VISA_NUM_RESERVED_PREDICATES + ? R->GetVar(Kernel) + : nullptr; +} + +void GenXKernelBuilder::buildSelectInst(SelectInst *SI, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + unsigned ExecSize = 1; + if (VectorType *VT = dyn_cast(SI->getType())) + ExecSize = VT->getNumElements(); + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + + VISA_PredVar *PredDecl = + createPredicateDeclFromSelect(SI, BI, Control, State, &MaskCtrl); + VISA_PredOpnd* PredOp = createPredOperand(PredDecl, State, Control); + + VISA_VectorOpnd *Dst = createDestination(SI, DONTCARESIGNED, Mod, DstDesc); + VISA_VectorOpnd *Src0 = createSourceOperand(SI, DONTCARESIGNED, 1, BI); + VISA_VectorOpnd *Src1 = createSourceOperand(SI, DONTCARESIGNED, 2, BI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_SEL, PredOp, Mod & MODIFIER_SAT, MaskCtrl, + getExecSizeFromValue(ExecSize), Dst, Src0, Src1)); +} + +void GenXKernelBuilder::buildBitCast(CastInst *CI, genx::BaleInfo BI, + unsigned Mod, const DstOpndDesc &DstDesc) { + if (!isMaskPacking(CI)) + assert(!BI.Bits && !Mod && !DstDesc.WrRegion && + "non predicate bitcast should not be baled with anything"); + + if (CI->getType()->getScalarType()->isIntegerTy(1)) { + if (CI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + if (auto C = dyn_cast(CI->getOperand(0))) { + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, CI, DONTCARESIGNED); + if (!Reg) + return; // write to EM/RM value, ignore + // We can move a constant predicate to a predicate register + // using setp, if we get the constant predicate as a single int. 
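+        // The vector-of-i1 constant is flattened into one integer (widened to
+        // at least 8 bits) and written to the predicate register with setp.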
+ unsigned IntVal = getPredicateConstantAsInt(C); + unsigned Size = C->getType()->getPrimitiveSizeInBits(); + C = ConstantInt::get( + Type::getIntNTy(CI->getContext(), std::max(Size, 8U)), IntVal); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, VISA_Exec_Size(genx::log2(Size)), + getPredicateVar(Reg), createSourceOperand(CI, UNSIGNED, 0, BI))); + return; + } + // There does not appear to be a vISA instruction to move predicate + // to predicate. GenXCoalescing avoids this by moving in two steps + // via a general register. So the only pred->pred bitcast that arrives + // here should be one from GenXLowering, and it should have been copy + // coalesced in GenXCoalescing. + assert(RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED) == + RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), DONTCARESIGNED) && + "uncoalesced phi move of predicate"); + return; + } + + VISA_PredVar *PredVar = getPredicateVar(CI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, + VISA_Exec_Size( + genx::log2(CI->getType()->getPrimitiveSizeInBits())), + PredVar, createSourceOperand(CI, UNSIGNED, 0, BI))); + return; + } + if (isa(CI->getOperand(0))) { + if (isa(CI->getOperand(0))) + return; // undef source, generate no code + // Source is constant. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = VT->getNumElements(); + + VISA_EMask_Ctrl ctrlMask = getExecMaskFromWrRegion(DstDesc, true); + VISA_Exec_Size execSize = getExecSizeFromValue(ExecSize); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, createPredFromWrRegion(DstDesc), Mod & MODIFIER_SAT, ctrlMask, + execSize, createDestination(CI, DONTCARESIGNED, Mod, DstDesc), + createSourceOperand(CI, DONTCARESIGNED, 0, BI))); + return; + } + if (CI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + // Bitcast from predicate to scalar int + Register *PredReg = + RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), DONTCARESIGNED); + assert(PredReg->Category == RegCategory::PREDICATE); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAPredicateMove( + createDestination(CI, UNSIGNED, 0, DstDesc), + PredReg->GetVar(Kernel))); + + return; + } + + // Real bitcast with possibly different types. Use whichever type has the + // largest element size, so we minimize the number of channels used in the + // move. + Type *Ty = CI->getOperand(0)->getType(); + if (Ty->getScalarType()->getPrimitiveSizeInBits() < + CI->getType()->getScalarType()->getPrimitiveSizeInBits()) + Ty = CI->getType(); + if (Liveness->isBitCastCoalesced(cast(CI))) + return; // bitcast was coalesced away + Register *DstReg = RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED, Ty); + // Give dest and source the same signedness for byte mov. + auto Signed = RegAlloc->getSigned(DstReg); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), Signed, Ty); + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Ty)) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + assert(ExecSize >= EXEC_SIZE_1 && ExecSize <= EXEC_SIZE_32 && + "illegal exec size in bitcast: should have been coalesced away"); + // destination + Region DestR(CI); + // source + Region SourceR(CI->getOperand(0)); + + VISA_EMask_Ctrl ctrlMask = NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1; + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, Mod, ctrlMask, ExecSize, + createRegionOperand(&DestR, DstReg->GetVar(Kernel), DONTCARESIGNED, + 0, true), + createRegionOperand(&SourceR, SrcReg->GetVar(Kernel), Signed, 0, + false))); +} + +void GenXKernelBuilder::buildFunctionAddr(Instruction *Inst, + const DstOpndDesc &DstDesc) { + + auto *Dst = createDestination(Inst, DONTCARESIGNED, MODIFIER_NONE, DstDesc); + assert(Dst); + auto *F = cast(cast(Inst)->getPointerOperand()); + CISA_CALL(Kernel->AppendVISACFSymbolInst(F->getName(), Dst)); +} + +/*********************************************************************** + * buildLoneWrRegion : build a lone wrregion + */ +void GenXKernelBuilder::buildLoneWrRegion(const DstOpndDesc &DstDesc) { + enum { OperandNum = 1 }; + Value *Input = DstDesc.WrRegion->getOperand(OperandNum); + if (isa(Input)) + return; // No code if input is undef + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Input->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + VISA_EMask_Ctrl ExecMask = getExecMaskFromWrRegion(DstDesc, true); + + // TODO: fix signedness of the source + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, createPredFromWrRegion(DstDesc), false, ExecMask, ExecSize, + createDestination(Input, DONTCARESIGNED, 0, DstDesc), + createSource(Input, DONTCARESIGNED, false, 0))); +} + +/*********************************************************************** + * buildLoneWrPredRegion : build a lone wrpredregion + */ +void GenXKernelBuilder::buildLoneWrPredRegion(Instruction *Inst, BaleInfo BI) { + enum { OperandNum = 1 }; + Value *Input = Inst->getOperand(OperandNum); + assert(isa(Input)); + auto C = dyn_cast(Input); + assert(C); + unsigned Size = C->getType()->getPrimitiveSizeInBits(); + + VISA_EMask_Ctrl ctrlMask = getExecMaskFromWrPredRegion(Inst, true); + VISA_Exec_Size execSize = getExecSizeFromValue(Size); + + unsigned IntVal = getPredicateConstantAsInt(C); + C = ConstantInt::get(Type::getIntNTy(Inst->getContext(), std::max(Size, 8U)), + IntVal); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP(ctrlMask, execSize, getPredicateVar(Inst), + createImmediateOperand(C, UNSIGNED))); +} + +/*********************************************************************** + * buildLoneOperand : build a rdregion or modifier that is not baled in to + * a main instruction + * + * Enter: Inst = the rdregion or modifier instruction + * BI = BaleInfo for Inst + * Mod = modifier for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion (possibly baling in + * variable index add) + */ +void GenXKernelBuilder::buildLoneOperand(Instruction *Inst, genx::BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + Instruction *WrRegion = DstDesc.WrRegion; + BaleInfo WrRegionBI = DstDesc.WrRegionBI; + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Inst->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + ISA_Opcode Opcode = ISA_MOV; + bool Baled = true; + VISA_EMask_Ctrl ExecMask = getExecMaskFromWrRegion(DstDesc); + // Default source from Inst + Value *Src = Inst; + + // Give dest and source the same signedness for byte mov. 
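+  // createDestination appears to report back, through &Signed, the
+  // signedness it actually chose for the destination; that value is then
+  // reused when the source operand is built below.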
+ auto Signed = DONTCARESIGNED; + // destination + auto Dest = createDestination(Inst, Signed, Mod, DstDesc, &Signed); + + // source + if ((Mod & MODIFIER_SAT) != 0 && + Inst->getType()->getScalarType()->isIntegerTy() && + GenXIntrinsic::isIntegerSat(Inst->user_back())) + Signed = getISatSrcSign(Inst->user_back()); + + if (BI.Type == BaleInfo::NOTMOD) { + // A lone "not" is implemented as a not instruction, rather than a mov + // with a not modifier. A mov only allows an arithmetic modifier. + Opcode = ISA_NOT; + Baled = BI.isOperandBaled(0); + // In this case the src is actually operand 0 of the noti intrinsic + Src = Inst->getOperand(0); + } else if (BI.Type == BaleInfo::RDREGION && !Mod) { + Register *DstReg; + if (WrRegion) { + DstReg = RegAlloc->getRegForValueOrNull(KernFunc, WrRegion, DONTCARESIGNED); + } else { + DstReg = RegAlloc->getRegForValue(KernFunc, Inst, DONTCARESIGNED); + } + if (DstReg && (DstReg->Category == RegCategory::SURFACE || + DstReg->Category == RegCategory::SAMPLER || + DstReg->Category == RegCategory::VME)) { + Opcode = ISA_MOVS; + } + } + // TODO: mb need to get signed from dest for src and then modify that + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + Opcode, (Opcode != ISA_MOVS ? createPredFromWrRegion(DstDesc) : nullptr), + Mod & MODIFIER_SAT, ExecMask, ExecSize, Dest, + createSource(Src, Signed, Baled, 0))); +} + +static unsigned getResultedTypeSize(Type *Ty) { + unsigned TySz = 0; + if (Ty->isVectorTy()) + TySz = Ty->getVectorNumElements() * + getResultedTypeSize(Ty->getVectorElementType()); + else if (Ty->isArrayTy()) + TySz = Ty->getArrayNumElements() * + getResultedTypeSize(Ty->getArrayElementType()); + else if (Ty->isStructTy()) { + StructType *STy = dyn_cast(Ty); + assert(STy); + for (Type *Ty : STy->elements()) + TySz += getResultedTypeSize(Ty); + } else if (Ty->isPointerTy() && Ty->getPointerElementType()->isFunctionTy()) { + TySz = BYTES_PER_FADDR; + } else { + TySz = Ty->getPrimitiveSizeInBits() / CHAR_BIT; + assert(TySz && "Ty is not primitive?"); + } + + return TySz; +} + +// Check if we're trying to form return value of a structure type +// TODO: should check full insert/extract chain (for failed coalescing cases), +// e.g. 
after failed coalescing we may end up having a bunch of +// extractvalue, insertvalue and bitcasts inst where only the last one +// should be actually lowered +static bool checkInsertToRetv(InsertValueInst *Inst) { + if (auto IVI = dyn_cast(Inst->use_begin()->getUser())) + return checkInsertToRetv(IVI); + else if (auto RI = dyn_cast(Inst->use_begin()->getUser())) + return RI->getFunction()->hasFnAttribute(genx::FunctionMD::CMStackCall) || + RI->getFunction()->hasFnAttribute( + genx::FunctionMD::ReferencedIndirectly); + return false; +} + +/*********************************************************************** + * buildMainInst : build a main instruction + * + * Enter: Inst = the main instruction + * BI = BaleInfo for Inst + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion (possibly baling in + * variable index add) + * + * Return: true if terminator inst that falls through to following block + */ +bool GenXKernelBuilder::buildMainInst(Instruction *Inst, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + if (PHINode *Phi = dyn_cast(Inst)) + buildPhiNode(Phi); + else if (ReturnInst *RI = dyn_cast(Inst)) { + buildRet(RI); + } else if (BranchInst *BR = dyn_cast(Inst)) { + return buildBranch(BR); + } else if (CmpInst *Cmp = dyn_cast(Inst)) { + buildCmp(Cmp, BI, DstDesc); + } else if (BinaryOperator *BO = dyn_cast(Inst)) { + if (!BO->getType()->getScalarType()->isIntegerTy(1)) { + buildBinaryOperator(BO, BI, Mod, DstDesc); + } else { + assert(!Mod && !DstDesc.WrRegion && !BI.isOperandBaled(0) && + !BI.isOperandBaled(1)); + buildBoolBinaryOperator(BO); + } + } else if (auto EVI = dyn_cast(Inst)) { + if (auto *CI = dyn_cast(Inst->getOperand(0))) + // translate extraction of structured type from retv + if (!CI->isInlineAsm() && (CI->getCalledFunction()->hasFnAttribute( + genx::FunctionMD::CMStackCall) || + CI->isIndirectCall())) + buildExtractRetv(EVI); + // no code generated + } else if (auto IVI = dyn_cast(Inst)) { + if (checkInsertToRetv(IVI) + // TODO: safely remove this tmp workaround for failed coalescing cases + // and insert-extract-insert chains + && !isa(Inst->getOperand(1))) + RetvInserts.push_back(IVI); + // no code generated + } else if (BitCastInst *BCI = dyn_cast(Inst)) { + buildBitCast(BCI, BI, Mod, DstDesc); + } else if (CastInst *CI = dyn_cast(Inst)) { + auto ptr2Int = dyn_cast(CI); + if (ptr2Int && isa(CI->getOperand(0))) { + buildSymbolInst(ptr2Int, Mod, DstDesc); + } else { + buildCastInst(CI, BI, Mod, DstDesc); + } + } else if (auto SI = dyn_cast(Inst)) { + buildSelectInst(SI, BI, Mod, DstDesc); + } else if (auto LI = dyn_cast(Inst)) { + (void)LI; // no code generated + } else if (auto GEPI = dyn_cast(Inst)) { + // check if gepi def is used in intrinsic, otherwise report error + auto GepiChecker = [](Use &ui) { + auto ci = cast(ui.getUser()); + Function *Callee = ci->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + return (IntrinID == GenXIntrinsic::genx_print_format_index); + }; + if (!std::all_of(GEPI->use_begin(), GEPI->use_end(), GepiChecker)) { + report_fatal_error("gep is supported only for printf"); + } +#if (LLVM_VERSION_MAJOR > 8) + } else if (UnaryOperator *UO = dyn_cast(Inst)) { + buildUnaryOperator(UO, BI, Mod, DstDesc); +#endif + } else if (auto *CI = dyn_cast(Inst)) { + if (CI->isInlineAsm()) + buildInlineAsm(CI); + else if (CI->isIndirectCall()) { + assert(!Mod && !DstDesc.WrRegion && + "cannot bale subroutine call into anything"); + 
buildCall(CI, DstDesc); + } else { + Function *Callee = CI->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + switch (IntrinID) { + case Intrinsic::dbg_value: + case Intrinsic::dbg_declare: + case GenXIntrinsic::genx_predefined_surface: + case GenXIntrinsic::genx_output: + // ignore + break; + case GenXIntrinsic::genx_simdcf_goto: + // A goto that is not baled into a branch (via an extractvalue) + buildGoto(CI, nullptr); + break; + case GenXIntrinsic::genx_simdcf_join: + // A join that is not baled into a branch (via an extractvalue) + buildJoin(CI, nullptr); + break; + case GenXIntrinsic::genx_convert: + buildConvert(CI, BI, Mod, DstDesc); + break; + case GenXIntrinsic::genx_print_format_index: + buildPrintIndex(CI, IntrinID, Mod, DstDesc); + break; + case GenXIntrinsic::genx_convert_addr: + buildConvertAddr(CI, BI, Mod, DstDesc); + break; + case GenXIntrinsic::genx_alloca: + buildAlloca(CI, IntrinID, Mod, DstDesc); + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + case GenXIntrinsic::genx_constantpred: + if (isa(CI->getOperand(0))) + return false; // Omit llvm.genx.constant with undef operand. + if (!DstDesc.WrRegion && !RegAlloc->getRegForValueOrNull(KernFunc, CI)) + return false; // Omit llvm.genx.constantpred that is EM or RM and so + // does not have a register allocated. + // fall through... + case GenXIntrinsic::genx_barrier: + HasBarrier = true; + default: + if (!(CI->user_empty() && + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_any)) + buildIntrinsic(CI, IntrinID, BI, Mod, DstDesc); + break; + case GenXIntrinsic::not_any_intrinsic: + assert(!Mod && !DstDesc.WrRegion && + "cannot bale subroutine call into anything"); + buildCall(CI, DstDesc); + break; + } + } + } else if (isa(Inst)) + ; // no code generated + else + report_fatal_error("main inst not implemented"); + + return false; +} + +/*********************************************************************** + * buildPhiNode : build code for a phi node + * + * A phi node generates no code because coalescing has ensured that all + * incomings and the result are in the same register. This function just + * asserts that that is the case. + */ +void GenXKernelBuilder::buildPhiNode(PHINode *Phi) { +#ifndef NDEBUG + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + // This assert has to cope with the case that the phi node has no live + // range because it is part of an indirected arg/retval in + // GenXArgIndirection, or it is an EM/RM category. 
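+    // Only incomings that are not undef, that have a live range, and whose
+    // category is a real register category are checked against the phi's
+    // own live range.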
+ if (!isa(Incoming)) + if (auto LR = Liveness->getLiveRangeOrNull(Incoming)) + if (LR->getCategory() < RegCategory::NUMREALCATEGORIES) + assert(LR == Liveness->getLiveRangeOrNull(Phi) && + "mismatched registers in phi node"); + } +#endif +} + +/*********************************************************************** + * buildGoto : translate a goto + * + * Enter: Goto = goto instruction that is baled into an extractvalue of + * field 2 (the !any(EM) value), that is baled into Branch + * Branch = branch instruction, 0 if this is a goto that is not + * baled into a branch, which happens when the goto is + * followed by a join point so the goto's JIP points there, + * and LLVM changes the resulting conditional branch with + * both successors the same into an unconditional branch + */ +void GenXKernelBuilder::buildGoto(CallInst *Goto, BranchInst *Branch) { + // GenXSimdCFConformance and GenXTidyControlFlow ensure that we have either + // 1. a forward goto, where the false successor is fallthrough; or + // 2. a backward goto, where the UIP (the join whose RM the goto updates) + // and the true successor are both fallthrough, and the false successor + // is the top of the loop. + // (1) generates a vISA forward goto, but the condition has the wrong sense + // so we need to invert it. + // (2) generates a vISA backward goto. + Value *BranchTarget = nullptr; + VISA_PREDICATE_STATE StateInvert = PredState_NO_INVERSE; + if (!Branch || + Branch->getSuccessor(1) == Branch->getParent()->getNextNode()) { + // Forward goto. Find the join. + auto Join = GotoJoin::findJoin(Goto); + assert(Join && "join not found"); + BranchTarget = Join; + StateInvert = PredState_INVERSE; + } else { + assert(Branch->getSuccessor(0) == Branch->getParent()->getNextNode() && + "bad goto structure"); + // Backward branch. + BranchTarget = Branch->getSuccessor(1); + } + // Get the condition. + VISA_EMask_Ctrl Mask = vISA_EMASK_M1; + VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + + Value *Pred = getPredicateOperand( + Goto, 2 /*OperandNum*/, Baling->getBaleInfo(Goto), Control, State, &Mask); + assert(!Mask && "cannot have rdpredregion baled into goto"); + + Register *PredReg = nullptr; + if (auto C = dyn_cast(Pred)) { + (void)C; + if (StateInvert) + assert(C->isNullValue() && + "predication operand must be constant 0 or not constant"); + else + assert(C->isAllOnesValue() && + "predication operand must be constant 1 or not constant"); + } else { + State ^= StateInvert; + PredReg = RegAlloc->getRegForValueOrNull(KernFunc, Pred); + assert(PredReg && PredReg->Category == RegCategory::PREDICATE); + } + + uint8_t execSize = genx::log2(Pred->getType()->getVectorNumElements()); + + // Visa decoder part + VISA_EMask_Ctrl emask = + VISA_EMask_Ctrl((execSize >> 0x4) & 0xF); + VISA_Exec_Size esize = (VISA_Exec_Size)((execSize)&0xF); + + VISA_PredOpnd *pred = nullptr; + if (PredReg) { + VISA_PredVar *Decl = getPredicateVar(PredReg); + VISA_PredOpnd *opnd = createPredOperand(Decl, State, Control); + pred = opnd; + } + + unsigned LabelID = getOrCreateLabel(BranchTarget, LABEL_BLOCK); + + VISA_LabelOpnd *label = Labels[LabelID]; + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFGotoInst(pred, emask, esize, label)); +} + +// Convert predicate offset to EM offset according to +// vISA spec 3.3.1 Execution Mask. 
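+// The offset is expected in multiples of 4 channels, so the mapping below is
+// effectively M(PredOffset / 4 + 1); any other offset is rejected.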
+static VISA_EMask_Ctrl getVisaEMOffset(unsigned PredOffset) { + switch (PredOffset) { + case 0: + return vISA_EMASK_M1; + case 4: + return vISA_EMASK_M2; + case 8: + return vISA_EMASK_M3; + case 12: + return vISA_EMASK_M4; + case 16: + return vISA_EMASK_M5; + case 20: + return vISA_EMASK_M6; + case 24: + return vISA_EMASK_M7; + case 28: + return vISA_EMASK_M8; + } + llvm_unreachable("Unexpected EM offset"); +} + +/*********************************************************************** + * getPredicateOperand : get predicate operand, scanning through any baled + * in rdpredregion, all, any, not instructions to derive the mask control + * field and the predication field + * + * Enter: Inst = instruction to get predicate operand from + * OperandNum = operand number in Inst + * BI = bale info for Inst + * *Control = where to write control information about predicate + * *State = where to write state information about predicate + * *MaskCtrl = where to write mask control field (bits 7..4) + * + * Return: Value of mask after scanning through baled in instructions + * *PredField and *MaskCtrl set + */ +Value *GenXKernelBuilder::getPredicateOperand( + Instruction *Inst, unsigned OperandNum, BaleInfo BI, + VISA_PREDICATE_CONTROL &Control, VISA_PREDICATE_STATE &State, + VISA_EMask_Ctrl *MaskCtrl) { + State = PredState_NO_INVERSE; + *MaskCtrl = vISA_EMASK_M1; + Control = PRED_CTRL_NON; + Value *Mask = Inst->getOperand(OperandNum); + // Check for baled in all/any/notp/rdpredregion. + while (BI.isOperandBaled(OperandNum)) { + Instruction *Inst = dyn_cast(Mask); + if (isNot(Inst)) { + if (Control != PRED_CTRL_NON) { + // switch any<->all as well as invert bit + Control ^= (VISA_PREDICATE_CONTROL)(PRED_CTRL_ANY | PRED_CTRL_ALL); + State ^= PredState_INVERSE; + } else { + // all/any not set, just invert invert bit + State ^= PredState_INVERSE; + } + OperandNum = 0; + assert(Inst); + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + } + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_all: + Control |= PRED_CTRL_ALL; // predicate combine field = "all" + OperandNum = 0; + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + case GenXIntrinsic::genx_any: + Control |= PRED_CTRL_ANY; // predicate combine field = "any" + OperandNum = 0; + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + case GenXIntrinsic::genx_rdpredregion: { + // Baled in rdpredregion. Use its constant offset for the mask control + // field. + unsigned MaskOffset = + cast(Inst->getOperand(1))->getSExtValue(); + *MaskCtrl = getVisaEMOffset(MaskOffset); + Mask = Inst->getOperand(0); + break; + } + default: + break; + } + // Baled shufflepred. Mask offset is deduced from initial value of slice. + if (auto *SVI = dyn_cast(Inst)) { + unsigned MaskOffset = + ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SVI) + .InitialOffset; + *MaskCtrl = getVisaEMOffset(MaskOffset); + Mask = SVI->getOperand(0); + } + break; + } + return Mask; +} + +void GenXKernelBuilder::AddGenVar(Register &Reg) { + auto &DL = FG->getModule()->getDataLayout(); + + VISA_GenVar *parentDecl = nullptr; + VISA_GenVar *Decl = nullptr; + + if (!Reg.AliasTo) { + // This is not an aliased register. Go through all the aliases and + // determine the biggest alignment required. If the register is at least + // as big as a GRF, make the alignment GRF. + unsigned Alignment = 5; // GRF alignment + Type *Ty = Reg.Ty; + unsigned NBits = Ty->isPointerTy() ? 
DL.getPointerSizeInBits()
+                                       : Ty->getPrimitiveSizeInBits();
+    if (NBits < GrfByteSize * 8 /* bits in GRF */) {
+      Alignment = 0;
+      for (Register *AliasReg = &Reg; AliasReg;
+           AliasReg = AliasReg->NextAlias[KernFunc]) {
+        Type *AliasTy = AliasReg->Ty->getScalarType();
+        unsigned ThisElementBytes = AliasTy->isPointerTy()
+                                        ? DL.getPointerTypeSize(AliasTy)
+                                        : AliasTy->getPrimitiveSizeInBits() / 8;
+        unsigned LogThisElementBytes = genx::log2(ThisElementBytes);
+        if (LogThisElementBytes > Alignment)
+          Alignment = LogThisElementBytes;
+        if (AliasReg->Alignment > Alignment)
+          Alignment = AliasReg->Alignment;
+      }
+    }
+    for (Register *AliasReg = &Reg; AliasReg;
+         AliasReg = AliasReg->NextAlias[KernFunc]) {
+      if (AliasReg->Alignment < Alignment)
+        AliasReg->Alignment = Alignment;
+    }
+  } else {
+    if (Reg.AliasTo->Num < visa::VISA_NUM_RESERVED_REGS) {
+      CISA_CALL(Kernel->GetPredefinedVar(parentDecl,
+                                         (PreDefined_Vars)Reg.AliasTo->Num));
+      assert(parentDecl && "Predefined variable is null");
+    } else {
+      parentDecl = Reg.AliasTo->GetVar(Kernel);
+      assert(parentDecl && "Refers to undefined var");
+    }
+  }
+
+  visa::TypeDetails TD(DL, Reg.Ty, Reg.Signed);
+
+  CISA_CALL(Kernel->CreateVISAGenVar(
+      Decl, Reg.NameStr.c_str(), TD.NumElements,
+      static_cast<VISA_Type>(TD.VisaType),
+      // 0x7 is a hack because for some reason
+      // alignment can be a large number
+      static_cast<VISA_Align>(Reg.Alignment & 0x7), parentDecl, 0));
+
+  Reg.SetVar(Kernel, Decl);
+
+  for (auto &Attr : Reg.Attributes) {
+    CISA_CALL(Kernel->AddAttributeToVar(
+        Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(),
+        (void *)(Attr.second.c_str())));
+  }
+}
+/**************************************************************************************************
+ * Scan the IR to collect information about whether the kernel has a callable
+ * function or a barrier.
+ */ +void GenXKernelBuilder::collectKernelInfo() { + for (auto It = FG->begin(), E = FG->end(); It != E; ++It) { + auto Func = *It; + HasStackcalls |= + Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly); + for (auto &BB : *Func) { + for (auto &I : BB) { + if (CallInst *CI = dyn_cast(&I)) { + if (CI->isInlineAsm()) + continue; + if (GenXIntrinsicInst *II = dyn_cast(CI)) { + auto IID = II->getIntrinsicID(); + if (IID == GenXIntrinsic::genx_barrier) + HasBarrier = true; + else if (IID == GenXIntrinsic::genx_alloca) + HasAlloca = true; + } else { + Function *Callee = CI->getCalledFunction(); + if (Callee && Callee->hasFnAttribute("CMCallable")) + HasCallable = true; + } + } + } + } + } +} +/************************************************************************************************** + * Build variables + */ +void GenXKernelBuilder::buildVariables() { + RegAlloc->SetRegPushHook(this, [](void *Object, GenXVisaRegAlloc::Reg &Reg) { + static_cast(Object)->AddGenVar(Reg); + }); + + for (auto &It : RegAlloc->getRegStorage()) { + Register *Reg = &(It); + switch (Reg->Category) { + case RegCategory::GENERAL: + AddGenVar(*Reg); + break; + + case RegCategory::ADDRESS: { + VISA_AddrVar *Decl = nullptr; + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + CISA_CALL( + Kernel->CreateVISAAddrVar(Decl, Reg->NameStr.c_str(), NumElements)); + Reg->SetVar(Kernel, Decl); + for (auto &Attr : Reg->Attributes) { + CISA_CALL(Kernel->AddAttributeToVar( + Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(), + (void *)(Attr.second.c_str()))); + } + } break; + + case RegCategory::PREDICATE: { + VISA_PredVar *Decl = nullptr; + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + CISA_CALL( + Kernel->CreateVISAPredVar(Decl, Reg->NameStr.c_str(), NumElements)); + Reg->SetVar(Kernel, Decl); + for (auto &Attr : Reg->Attributes) { + CISA_CALL(Kernel->AddAttributeToVar( + Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(), + (void *)(Attr.second.c_str()))); + } + } break; + + case RegCategory::SAMPLER: { + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + VISA_SamplerVar *Decl = nullptr; + CISA_CALL(Kernel->CreateVISASamplerVar(Decl, Reg->NameStr.c_str(), + NumElements)); + Reg->SetVar(Kernel, Decl); + } break; + + case RegCategory::SURFACE: { + VISA_SurfaceVar *Decl = nullptr; + if (Reg->Num < visa::VISA_NUM_RESERVED_SURFACES) { + Kernel->GetPredefinedSurface(Decl, (PreDefined_Surface)Reg->Num); + } else { + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + + CISA_CALL(Kernel->CreateVISASurfaceVar(Decl, Reg->NameStr.c_str(), + NumElements)); + } + Reg->SetVar(Kernel, Decl); + } break; + + case RegCategory::VME: + report_fatal_error("VME variable is no longer supported"); + break; + + default: + report_fatal_error("Unknown category for register"); + break; + } + } + + VISA_GenVar *ArgDecl = nullptr, *RetDecl = nullptr; + Kernel->GetPredefinedVar(ArgDecl, PREDEFINED_ARG); + Kernel->GetPredefinedVar(RetDecl, PREDEFINED_RET); + createCisaVariable(Kernel, "argv", ArgDecl, ARG_SIZE_IN_GRFS * GrfByteSize); + createCisaVariable(Kernel, "retv", RetDecl, RET_SIZE_IN_GRFS * GrfByteSize); +} + +/*********************************************************************** + * getExecMaskFromWrPredRegion : write exec size 
field from wrpredregion
+ * or wrpredpredregion instruction
+ *
+ * Enter:  ExecSize = execution size
+ *         WrPredRegion = 0 else wrpredregion instruction
+ *
+ * The exec size byte includes the mask control field, which we need to set
+ * up from the wrpredregion/wrpredpredregion.
+ */
+VISA_EMask_Ctrl
+GenXKernelBuilder::getExecMaskFromWrPredRegion(Instruction *WrPredRegion,
+                                               bool IsNoMask) {
+  VISA_EMask_Ctrl MaskCtrl =
+      (IsNoMask | NoMask) ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
+  if (WrPredRegion) {
+    // Get the mask control field from the offset in the wrpredregion.
+    unsigned MaskOffset =
+        cast<ConstantInt>(WrPredRegion->getOperand(2))->getSExtValue();
+    assert(MaskOffset < 32 && "unexpected mask offset");
+    MaskCtrl = static_cast<VISA_EMask_Ctrl>(MaskOffset >> 2);
+  }
+
+  // Set to NoMask if requested. Otherwise use the default NM mode
+  // when WrPredRegion is null.
+  if ((IsNoMask && MaskCtrl < vISA_EMASK_M1_NM) ||
+      (!WrPredRegion && NoMask && MaskCtrl < vISA_EMASK_M1_NM))
+    MaskCtrl = static_cast<VISA_EMask_Ctrl>(
+        static_cast<unsigned>(MaskCtrl) + vISA_EMASK_M1_NM);
+
+  return MaskCtrl;
+}
+
+/***********************************************************************
+ * getExecMaskFromWrRegion : get exec size field from wrregion instruction
+ *
+ * Enter:  ExecSize = execution size
+ *         WrRegion = 0 else wrregion instruction
+ *         WrRegionBI = BaleInfo for wrregion, so we can see if there is a
+ *                      rdpredregion baled in to the mask
+ *
+ * If WrRegion != 0, and it has a mask that is not constant 1, then the
+ * mask must be a predicate register.
+ *
+ * The exec size byte includes the mask control field, which we need to set
+ * up from any rdpredregion baled in to a predicated wrregion.
+ *
+ * If the predicate has no register allocated, it must be EM, and we set the
+ * instruction to be masked. Otherwise we set nomask.
+ */
+VISA_EMask_Ctrl
+GenXKernelBuilder::getExecMaskFromWrRegion(const DstOpndDesc &DstDesc,
+                                           bool IsNoMask) {
+  // Override mask control if requested.
+  auto MaskCtrl = (IsNoMask | NoMask) ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
+
+  if (DstDesc.WrRegion) {
+    // Get the predicate (mask) operand, scanning through baled in
+    // all/any/not/rdpredregion and setting PredField and MaskCtrl
+    // appropriately.
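+    // If the mask is a constant or has a real predicate register allocated
+    // (i.e. it is not the execution mask itself), NoMask mode may be applied
+    // when the builder is in NoMask state.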
+ VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + Value *Mask = + getPredicateOperand(DstDesc.WrRegion, 7 /*mask operand in wrregion*/, + DstDesc.WrRegionBI, Control, State, &MaskCtrl); + if ((isa(Mask) || RegAlloc->getRegForValueOrNull(KernFunc, Mask)) && NoMask) + MaskCtrl |= vISA_EMASK_M1_NM; + } + return MaskCtrl; +} + +/*********************************************************************** + * buildIntrinsic : build code for an intrinsic + * + * Enter: CI = the CallInst + * IntrinID = intrinsic ID + * BI = BaleInfo for the instruction + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildIntrinsic(CallInst *CI, unsigned IntrinID, + BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + using II = GenXIntrinsicInfo; + LLVM_DEBUG(dbgs() << "buildIntrinsic: " << *CI << "\n"); + + int MaxRawOperands = std::numeric_limits::max(); + + // TODO: replace lambdas by methods + + auto GetUnsignedValue = [&](II::ArgInfo AI) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + return static_cast(Const->getSExtValue()); + }; + + auto CreateSurfaceOperand = [&](II::ArgInfo AI) { + llvm::Value *Arg = CI->getArgOperand(AI.getArgIdx()); + VISA_SurfaceVar *SurfDecl = nullptr; + int Index = visa::convertToSurfaceIndex(Arg); + if (visa::isReservedSurfaceIndex(Index)) { + Kernel->GetPredefinedSurface(SurfDecl, visa::getReservedSurface(Index)); + } else { + Register *Reg = RegAlloc->getRegForValue(KernFunc, Arg); + assert(Reg->Category == RegCategory::SURFACE && + "Expected surface register"); + SurfDecl = Reg->GetVar(Kernel); + } + VISA_StateOpndHandle *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISAStateOperandHandle(ResultOperand, SurfDecl)); + return ResultOperand; + }; + + auto CreateSamplerOperand = [&](II::ArgInfo AI) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, CI->getArgOperand(AI.getArgIdx())); + assert(Reg->Category == RegCategory::SAMPLER && + "Expected sampler register"); + VISA_StateOpndHandle *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISAStateOperandHandle( + ResultOperand, Reg->GetVar(Kernel))); + return ResultOperand; + }; + + auto GetMediaHeght = [&](II::ArgInfo AI) { + // constant byte for media height that we need to infer from the + // media width and the return type or final arg + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Width = Const->getZExtValue(); + if (Width == 0 || Width > 64) + report_fatal_error("Invalid media width"); + unsigned RoundedWidth = 1 << genx::log2(Width); + if (RoundedWidth < Width) + RoundedWidth *= 2; + if (RoundedWidth < 4) + RoundedWidth = 4; + Type *DataType = CI->getType(); + if (DataType->isVoidTy()) + DataType = CI->getOperand(CI->getNumArgOperands() - 1)->getType(); + unsigned DataSize; + if (VectorType *VT = dyn_cast(DataType)) + DataSize = VT->getElementType()->getPrimitiveSizeInBits() / 8 * + VT->getNumElements(); + else + DataSize = DataType->getPrimitiveSizeInBits() / 8; + if (DataSize <= RoundedWidth && DataSize >= Width) + return static_cast(1); + if (DataSize % RoundedWidth) + report_fatal_error("Invalid media width"); + return static_cast(DataSize / RoundedWidth); + }; + + auto CreateOperand = [&](II::ArgInfo AI) { + VISA_VectorOpnd *ResultOperand = 
nullptr; + Signedness Signed = DONTCARESIGNED; + if (AI.needsSigned()) + Signed = SIGNED; + else if (AI.needsUnsigned()) + Signed = UNSIGNED; + if (AI.isRet()) { + if (AI.getSaturation() == II::SATURATION_SATURATE) + Mod |= MODIFIER_SAT; + ResultOperand = createDestination(CI, Signed, Mod, DstDesc); + } else { + unsigned MaxWidth = 16; + if (AI.getRestriction() == II::TWICEWIDTH) { + // For a TWICEWIDTH operand, do not allow width bigger than the + // execution size. + MaxWidth = CI->getType()->getVectorNumElements(); + } + ResultOperand = createSourceOperand(CI, Signed, AI.getArgIdx(), BI, 0, + nullptr, MaxWidth); + } + return ResultOperand; + }; + + auto CreateRawOperand = [&](II::ArgInfo AI) { + VISA_RawOpnd *ResultOperand = nullptr; + auto Signed = DONTCARESIGNED; + if (AI.needsSigned()) + Signed = SIGNED; + else if (AI.needsUnsigned()) + Signed = UNSIGNED; + if (AI.isRet()) { + assert(!Mod); + ResultOperand = createRawDestination(CI, DstDesc, Signed); + } else if (AI.getArgIdx() < MaxRawOperands) + ResultOperand = createRawSourceOperand(CI, AI.getArgIdx(), BI, Signed); + return ResultOperand; + }; + + auto CreateRawOperands = [&](II::ArgInfo AI, VISA_RawOpnd **Operands) { + assert(MaxRawOperands != std::numeric_limits::max() && + "MaxRawOperands must be defined"); + for (int i = 0; i < AI.getArgIdx() + MaxRawOperands; ++i) { + Operands[i] = CreateRawOperand(II::ArgInfo(II::RAW | (AI.Info + i))); + } + }; + + auto GetOwords = [&](II::ArgInfo AI) { + // constant byte for log2 number of owords + Value *Arg = CI; + if (!AI.isRet()) + Arg = CI->getOperand(AI.getArgIdx()); + VectorType *VT = dyn_cast(Arg->getType()); + if (!VT) + report_fatal_error("Invalid number of owords"); + int DataSize = VT->getNumElements() * + DL.getTypeSizeInBits(VT->getElementType()) / 8; + DataSize = genx::exactLog2(DataSize) - 4; + if (DataSize < 0 || DataSize > 4) + report_fatal_error("Invalid number of words"); + return static_cast(DataSize); + }; + + auto GetExecSize = [&](II::ArgInfo AI, VISA_EMask_Ctrl *Mask) { + int ExecSize = GenXIntrinsicInfo::getOverridedExecSize(CI, Subtarget); + if (ExecSize == 0) { + if (VectorType *VT = dyn_cast(CI->getType())) { + ExecSize = VT->getNumElements(); + } else { + ExecSize = 1; + } + } + bool IsNoMask = AI.getCategory() == II::EXECSIZE_NOMASK; + *Mask = getExecMaskFromWrRegion(DstDesc, IsNoMask); + return getExecSizeFromValue(ExecSize); + }; + + auto GetExecSizeFromArg = [&](II::ArgInfo AI, + VISA_EMask_Ctrl *ExecMask) { + // exec_size inferred from width of predicate arg, defaulting to 16 if + // it is scalar i1 (as can happen in raw send). Also get M3 etc flag + // if the predicate has a baled in rdpredregion, and mark as nomask if + // the predicate is not EM. + int ExecSize; + *ExecMask = NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + Value *Mask = + getPredicateOperand(CI, AI.getArgIdx(), BI, Control, State, ExecMask); + if (isa(Mask) || RegAlloc->getRegForValueOrNull(KernFunc, Mask)) + *ExecMask |= NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1; + if (auto VT = + dyn_cast(CI->getOperand(AI.getArgIdx())->getType())) + ExecSize = VT->getNumElements(); + else + ExecSize = GenXIntrinsicInfo::getOverridedExecSize(CI, Subtarget); + return getExecSizeFromValue(ExecSize); + }; + + auto GetExecSizeFromByte = [&](II::ArgInfo AI, VISA_EMask_Ctrl *Mask) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Byte = Const->getSExtValue() & 15; + *Mask = (VISA_EMask_Ctrl)(Byte >> 4); + unsigned Res = Byte & 0xF; + assert(Res <= 5 && + "illegal common ISA execsize (should be 1, 2, 4, 8, 16, 32)."); + return (VISA_Exec_Size)Res; + }; + + auto CreateImplicitPredication = [&](II::ArgInfo AI) { + return createPredFromWrRegion(DstDesc); + }; + + auto CreatePredication = [&](II::ArgInfo AI) { + return createPred(CI, BI, AI.getArgIdx()); + }; + + auto GetPredicateVar = [&](II::ArgInfo AI) { + if (AI.isRet()) + return getPredicateVar(CI); + else + return getPredicateVar(CI->getArgOperand(AI.getArgIdx())); + }; + + auto GetZeroedPredicateVar = [&](II::ArgInfo AI) { + if (AI.isRet()) + return getZeroedPredicateVar(CI); + else + return getZeroedPredicateVar(CI->getArgOperand(AI.getArgIdx())); + }; + + auto CreateNullRawOperand = [&](II::ArgInfo AI) { + VISA_RawOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, false)); + return ResultOperand; + }; + + auto ProcessTwoAddr = [&](II::ArgInfo AI) { + if (AI.getCategory() != II::TWOADDR) + return; + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, CI, DONTCARESIGNED); + if (isa(CI->getArgOperand(AI.getArgIdx())) && Reg && + isInLoop(CI->getParent())) + addLifetimeStartInst(CI); + }; + + // Constant vector of i1 (or just scalar i1) as i32 (used in setp) + auto ConstVi1Asi32 = [&](II::ArgInfo AI) { + VISA_VectorOpnd *ResultOperand = nullptr; + auto C = cast(CI->getArgOperand(AI.getArgIdx())); + // Get the bit value of the vXi1 constant. 
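+    // Each i1 element presumably contributes one bit of the packed value,
+    // which is then emitted as an unsigned 32-bit (UD) immediate.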
+ unsigned IntVal = getPredicateConstantAsInt(C); + // unsigned i32 constant source operand + CISA_CALL(Kernel->CreateVISAImmediate(ResultOperand, &IntVal, ISA_TYPE_UD)); + return ResultOperand; + }; + + auto CreateAddressOperand = [&](II::ArgInfo AI) { + if (AI.isRet()) + return createAddressOperand(CI, true); + else + return createAddressOperand(CI->getArgOperand(AI.getArgIdx()), false); + }; + + auto GetArgCount = [&](II::ArgInfo AI) { + auto BaseArg = AI.getArgIdx(); + MaxRawOperands = BaseArg; + + for (unsigned Idx = BaseArg; Idx < CI->getNumArgOperands(); ++Idx) { + if (auto CA = dyn_cast(CI->getArgOperand(Idx))) { + if (CA->isNullValue()) + continue; + } + MaxRawOperands = Idx + 1; + } + + if (MaxRawOperands < BaseArg + AI.getArgCountMin()) + MaxRawOperands = BaseArg + AI.getArgCountMin(); + + return MaxRawOperands - AI.getArgIdx(); + }; + + auto GetNumGrfs = [&](II::ArgInfo AI) { + // constant byte for number of GRFs + Value *Arg = CI; + if (!AI.isRet()) + Arg = CI->getOperand(AI.getArgIdx()); + VectorType *VT = dyn_cast(Arg->getType()); + if (!VT) + report_fatal_error("Invalid number of GRFs"); + int DataSize = VT->getNumElements() * + VT->getElementType()->getPrimitiveSizeInBits() / 8; + return (uint8_t)((DataSize + (GrfByteSize - 1)) / GrfByteSize); + }; + + auto GetSampleChMask = [&](II::ArgInfo AI) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Byte = Const->getSExtValue() & 15; + // Find the U_offset arg. It is the first vector arg after this one. + VectorType *VT; + for (unsigned Idx = AI.getArgIdx() + 1; + !(VT = dyn_cast(CI->getOperand(Idx)->getType())); ++Idx) + ; + unsigned Width = VT->getNumElements(); + if (Width != 8 && Width != 16) + report_fatal_error("Invalid execution size for load/sample"); + Byte |= Width & 16; + return Byte; + }; + + auto GetSvmGatherBlockSize = [&](II::ArgInfo AI) { + // svm gather/scatter "block size" field, set to reflect the element + // type of the data + Value *V = CI; + if (!AI.isRet()) + V = CI->getArgOperand(AI.getArgIdx()); + unsigned ElBytes = + V->getType()->getScalarType()->getPrimitiveSizeInBits() / 8; + switch (ElBytes) { + // For N = 2 byte data type, use block size 1 and block count 2. + // Otherwise, use block size N and block count 1. 
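+    // The encoded field enumerates the supported block sizes: 1 byte (also
+    // used for 2-byte data, with block count 2) -> 0, 4 bytes -> 1,
+    // 8 bytes -> 2.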
+ case 2: + case 1: + ElBytes = 0; + break; + case 4: + ElBytes = 1; + break; + case 8: + ElBytes = 2; + break; + default: + report_fatal_error("Bad element type for SVM scatter/gather"); + } + return ElBytes; + }; + + auto CreateOpndPredefinedSrc = [&](PreDefined_Vars RegId, unsigned ROffset, + unsigned COffset, unsigned VStride, + unsigned Width, unsigned HStride) { + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, RegId)); + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(ResultOperand, Decl, + (VISA_Modifier)Mod, VStride, Width, + HStride, ROffset, COffset)); + return ResultOperand; + }; + + auto CreateOpndPredefinedDst = [&](PreDefined_Vars RegId, unsigned ROffset, + unsigned COffset, unsigned HStride) { + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, RegId)); + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(ResultOperand, Decl, HStride, + ROffset, COffset)); + return ResultOperand; + }; + + auto CreateImmOpndFromUInt = [&](VISA_Type ImmType, unsigned Val) { + VISA_VectorOpnd *src = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(src, &Val, ImmType)); + + return src; + }; + + + VISA_EMask_Ctrl exec_mask; + addDebugInfo(); +#include "GenXIntrinsicsBuildMap.inc" +} + +/************************************************************************************************** + * buildControlRegUpdate : generate an instruction to apply a mask to + * the control register (V14). + * + * Enter: Mask = the mask to apply + * Clear = false if bits set in Mask should be set in V14, + * true if bits set in Mask should be cleared in V14. + */ +void GenXKernelBuilder::buildControlRegUpdate(unsigned Mask, bool Clear) { + ISA_Opcode Opcode; + // write opcode + if (Clear) { + Opcode = ISA_AND; + Mask = ~Mask; + } else + Opcode = ISA_OR; + + Region Single = Region(1, 4); + + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, PREDEFINED_CR0)); + VISA_VectorOpnd *dst = + createRegionOperand(&Single, Decl, DONTCARESIGNED, 0, true); + VISA_VectorOpnd *src0 = + createRegionOperand(&Single, Decl, DONTCARESIGNED, 0, false); + + VISA_VectorOpnd *src1 = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(src1, &Mask, ISA_TYPE_UD)); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALogicOrShiftInst(Opcode, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, dst, + src0, src1, nullptr, nullptr)); +} + +/*********************************************************************** + * buildBranch : build a conditional or unconditional branch + * + * Return: true if fell through to successor + */ +bool GenXKernelBuilder::buildBranch(BranchInst *Branch) { + BasicBlock *Next = Branch->getParent()->getNextNode(); + if (Branch->isUnconditional()) { + // Unconditional branch + if (Branch->getOperand(0) == Next) + return true; // fall through to successor + auto labelId = getOrCreateLabel(Branch->getSuccessor(0), LABEL_BLOCK); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst(nullptr, Labels[labelId])); + return false; + } + // Conditional branch. + // First check if it is a baled in goto/join, via an extractvalue. 
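+  // If the condition is an extractvalue of a simdcf goto/join that has been
+  // baled into this branch, the branch is emitted as part of that goto/join
+  // and the block is treated as falling through.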
+ auto BI = Baling->getBaleInfo(Branch); + if (BI.isOperandBaled(0 /*condition*/)) { + if (auto Extract = dyn_cast(Branch->getCondition())) { + auto GotoJoin = cast(Extract->getAggregateOperand()); + if (GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto) { + buildGoto(GotoJoin, Branch); + } else { + assert(GotoJoin::isValidJoin(GotoJoin) && + "extra unexpected code in join block"); + buildJoin(GotoJoin, Branch); + } + return true; + } + } + // Normal conditional branch. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + Value *Pred = getPredicateOperand(Branch, 0, BI, Control, State, &MaskCtrl); + assert(!isa(Branch->getCondition()->getType()) && + "branch must have scalar condition"); + BasicBlock *True = Branch->getSuccessor(0); + BasicBlock *False = Branch->getSuccessor(1); + if (True == Next) { + State ^= PredState_INVERSE; // invert bit in predicate field + True = False; + False = Next; + } + // Write the conditional branch. + VISA_PredVar *PredVar = getPredicateVar(Pred); + VISA_PredOpnd* PredOperand = createPredOperand(PredVar, State, Control); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst( + PredOperand, Labels[getOrCreateLabel(True, LABEL_BLOCK)])); + // If the other successor is not the next block, write an unconditional + // jmp to that. + if (False == Next) + return true; // fall through to successor + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst( + nullptr, Labels[getOrCreateLabel(False, LABEL_BLOCK)])); + return false; +} + +/*********************************************************************** + * buildJoin : build a join + * + * Enter: Join = join instruction that is baled into an extractvalue of + * field 1 (the !any(EM) value), that is baled into Branch, + * if Branch is non-zero + * Branch = branch instruction, or 0 for a join that is not baled + * in to a branch because it always ends up with at least + * one channel enabled + */ +void GenXKernelBuilder::buildJoin(CallInst *Join, BranchInst *Branch) { + // A join needs a label. (If the join is at the start of its block, then + // this gets merged into the block label.) + addLabelInst(Join); + // There is no join instruction in vISA -- the finalizer derives it by + // looking for gotos targeting the basic block's label. 
+} + +#if (LLVM_VERSION_MAJOR > 8) +/*********************************************************************** + * buildUnaryOperator : build code for an unary operator + * + * Enter: UO = the UnaryOperator + * BI = BaleInfo for UO + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildUnaryOperator(UnaryOperator *UO, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + ISA_Opcode Opcode = ISA_RESERVED_0; + Signedness DstSigned = SIGNED; + Signedness SrcSigned = SIGNED; + unsigned Mod1 = 0; + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VectorType *VT = dyn_cast(UO->getType()); + if (VT != nullptr) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + switch (UO->getOpcode()) { + case Instruction::FNeg: + Opcode = ISA_MOV; + Mod1 ^= MODIFIER_NEG; + break; + default: + report_fatal_error("buildUnaryOperator: unimplemented unary operator"); + } + + VISA_VectorOpnd *Dst = createDestination(UO, DstSigned, Mod, DstDesc); + + VISA_VectorOpnd *Src0 = nullptr; + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + + Src0 = createSourceOperand(UO, SrcSigned, 0, BI, Mod1); + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + addDebugInfo(); + + if (Opcode == ISA_MOV) { + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, Pred, Mod1 & MODIFIER_SAT, ExecMask, ExecSize, Dst, Src0, NULL)); + return; + } + report_fatal_error("buildUnaryOperator: unimplemented opcode"); +} +#endif + +/*********************************************************************** + * buildBinaryOperator : build code for a binary operator + * + * Enter: BO = the BinaryOperator + * BI = BaleInfo for BO + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildBinaryOperator(BinaryOperator *BO, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + bool IsLogic = false; + ISA_Opcode Opcode = ISA_RESERVED_0; + Signedness DstSigned = SIGNED; + Signedness SrcSigned = SIGNED; + unsigned Mod1 = 0; + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(BO->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + switch (BO->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + Opcode = ISA_ADD; + break; + case Instruction::Sub: + case Instruction::FSub: + Opcode = ISA_ADD; + Mod1 ^= MODIFIER_NEG; + break; + case Instruction::Mul: + case Instruction::FMul: + Opcode = ISA_MUL; + break; + case Instruction::Shl: + Opcode = ISA_SHL; + IsLogic = true; + break; + case Instruction::AShr: + Opcode = ISA_ASR; + IsLogic = true; + break; + case Instruction::LShr: + Opcode = ISA_SHR; + DstSigned = SrcSigned = UNSIGNED; + IsLogic = true; + break; + case Instruction::UDiv: + Opcode = ISA_DIV; + DstSigned = SrcSigned = UNSIGNED; + break; + case Instruction::SDiv: + Opcode = ISA_DIV; + break; + case Instruction::FDiv: { + Opcode = ISA_DIV; + if (Constant *Op0 = dyn_cast(BO->getOperand(0))) { + if (Op0->getType()->isVectorTy()) + Op0 = Op0->getSplatValue(); + ConstantFP *CFP = dyn_cast_or_null(Op0); + if (CFP && CFP->isExactlyValue(1.0)) + Opcode = ISA_INV; + } + } break; + case Instruction::URem: + Opcode = ISA_MOD; + DstSigned = SrcSigned = UNSIGNED; + break; + case Instruction::SRem: + case Instruction::FRem: + Opcode = ISA_MOD; + break; + case Instruction::And: + Opcode = ISA_AND; + IsLogic = true; + break; + case Instruction::Or: + Opcode = ISA_OR; + IsLogic = true; + 
break; + case Instruction::Xor: + Opcode = ISA_XOR; + IsLogic = true; + break; + default: + report_fatal_error("buildBinaryOperator: unimplemented binary operator"); + break; + } + VISA_VectorOpnd *Dst = createDestination(BO, DstSigned, Mod, DstDesc); + + VISA_VectorOpnd *Src0 = nullptr; + VISA_VectorOpnd *Src1 = nullptr; + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + + if (Opcode == ISA_INV) { + Src0 = createSourceOperand(BO, SrcSigned, 1, BI, Mod1); // source 0 + } else { + Src0 = createSourceOperand(BO, SrcSigned, 0, BI); // source 0 + Src1 = createSourceOperand(BO, SrcSigned, 1, BI, Mod1); // source 1 + } + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + addDebugInfo(); + if (IsLogic) { + CISA_CALL(Kernel->AppendVISALogicOrShiftInst( + Opcode, Pred, Mod, ExecMask, ExecSize, Dst, Src0, Src1, NULL, NULL)); + } else { + if (Opcode == ISA_ADDC || Opcode == ISA_SUBB) { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + Opcode, Pred, ExecMask, ExecSize, Dst, Src0, Src1, NULL)); + } else { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + Opcode, Pred, Mod, ExecMask, ExecSize, Dst, Src0, Src1, NULL)); + } + } +} + +/*********************************************************************** + * buildBoolBinaryOperator : build code for a binary operator acting on + * i1 or vector of i1 + * + * Enter: BO = the BinaryOperator + */ +void GenXKernelBuilder::buildBoolBinaryOperator(BinaryOperator *BO) { + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(BO->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + ISA_Opcode Opcode = ISA_RESERVED_0; + switch (BO->getOpcode()) { + case Instruction::And: + Opcode = ISA_AND; + break; + case Instruction::Or: + Opcode = ISA_OR; + break; + case Instruction::Xor: + Opcode = ISA_XOR; + if (isNot(BO)) + Opcode = ISA_NOT; + break; + default: + report_fatal_error( + "buildBoolBinaryOperator: unimplemented binary operator"); + break; + } + + VISA_PredVar *Dst = getPredicateVar(BO); + VISA_PredVar *Src0 = getPredicateVar(BO->getOperand(0)); + VISA_PredVar *Src1 = + Opcode != ISA_NOT ? getPredicateVar(BO->getOperand(1)) : nullptr; + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALogicOrShiftInst( + Opcode, NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1, ExecSize, Dst, Src0, + Src1)); +} + +void GenXKernelBuilder::buildSymbolInst(PtrToIntInst *ptr2Int, unsigned Mod, + const DstOpndDesc &DstDesc) { + auto GV = cast(ptr2Int->getOperand(0)); + VISA_VectorOpnd *Dst = createDestination(ptr2Int, UNSIGNED, Mod, DstDesc); + CISA_CALL(Kernel->AppendVISACFSymbolInst(GV->getName().str(), Dst)); +} + +/*********************************************************************** + * buildCastInst : build code for a cast (other than a bitcast) + * + * Enter: CI = the CastInst + * BI = BaleInfo for CI + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildCastInst(CastInst *CI, BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + Signedness InSigned = DONTCARESIGNED; + Signedness OutSigned = DONTCARESIGNED; + switch (CI->getOpcode()) { + case Instruction::UIToFP: + InSigned = UNSIGNED; + break; + case Instruction::SIToFP: + InSigned = SIGNED; + break; + case Instruction::FPToUI: + OutSigned = UNSIGNED; + break; + case Instruction::FPToSI: + OutSigned = SIGNED; + break; + case Instruction::ZExt: + InSigned = UNSIGNED; + break; + case Instruction::SExt: + InSigned = SIGNED; + break; + case Instruction::FPTrunc: + case Instruction::FPExt: + break; + case Instruction::PtrToInt: + case Instruction::IntToPtr: + break; + case Instruction::AddrSpaceCast: + break; + default: + report_fatal_error("buildCastInst: unimplemented cast"); + break; + } + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + // Give dest and source the same signedness for byte mov. 
+ VISA_VectorOpnd *Dst = createDestination(CI, OutSigned, Mod, DstDesc); + + if (InSigned == DONTCARESIGNED) + InSigned = OutSigned; + VISA_VectorOpnd *Src0 = createSourceOperand(CI, InSigned, 0, BI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, Pred, Mod & MODIFIER_SAT, ExecMask, ExecSize, Dst, Src0, NULL)); +} + +/*********************************************************************** + * buildCmp : build code for a compare + * + * Enter: Cmp = the compare instruction + * BI = BaleInfo for Cmp + * WrRegion = 0 else wrpredregion, wrpredpredregion, or wrregion for + * destination + */ +void GenXKernelBuilder::buildCmp(CmpInst *Cmp, BaleInfo BI, + const DstOpndDesc &DstDesc) { + assert((!DstDesc.WrRegion || Cmp->getType()->getPrimitiveSizeInBits() != 4 || + Cmp->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits() == 64) && + "write predicate size 4 only allowed for double/longlong type"); + Signedness Signed = DONTCARESIGNED; + VISA_Cond_Mod opSpec; + switch (Cmp->getPredicate()) { + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: + case CmpInst::ICMP_EQ: + opSpec = ISA_CMP_E; + break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + opSpec = ISA_CMP_NE; + break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: + opSpec = ISA_CMP_G; + break; + case CmpInst::ICMP_UGT: + opSpec = ISA_CMP_G; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SGT: + opSpec = ISA_CMP_G; + Signed = SIGNED; + break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: + opSpec = ISA_CMP_GE; + break; + case CmpInst::ICMP_UGE: + opSpec = ISA_CMP_GE; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SGE: + opSpec = ISA_CMP_GE; + Signed = SIGNED; + break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: + opSpec = ISA_CMP_L; + break; + case CmpInst::ICMP_ULT: + opSpec = ISA_CMP_L; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SLT: + opSpec = ISA_CMP_L; + Signed = SIGNED; + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: + opSpec = ISA_CMP_LE; + break; + case CmpInst::ICMP_ULE: + opSpec = ISA_CMP_LE; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SLE: + opSpec = ISA_CMP_LE; + Signed = SIGNED; + break; + default: + report_fatal_error("unknown predicate"); + opSpec = ISA_CMP_E; + break; + } + + // Check if this is to write to a predicate desination or a GRF desination. + bool WriteToPred = true; + if (Cmp->hasOneUse()) { + Instruction *UI = Cmp->user_back(); + BaleInfo UserBI = Baling->getBaleInfo(UI); + if (UserBI.Type == BaleInfo::CMPDST) + WriteToPred = false; + } + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VISA_EMask_Ctrl ctrlMask = vISA_EMASK_M1; + if (VectorType *VT = dyn_cast(Cmp->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + VISA_VectorOpnd *Dst = nullptr; + genx::Signedness SignedSrc0; + VISA_VectorOpnd *Src0 = + createSourceOperand(Cmp, Signed, 0, BI, 0, &SignedSrc0); + VISA_VectorOpnd *Src1 = createSourceOperand(Cmp, SignedSrc0, 1, BI); + + if (WriteToPred) { + ctrlMask = getExecMaskFromWrPredRegion(DstDesc.WrRegion, false); + VISA_PredVar *PredVar = + getPredicateVar(DstDesc.WrRegion ? DstDesc.WrRegion : Cmp); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAComparisonInst(opSpec, ctrlMask, ExecSize, + PredVar, Src0, Src1)); + } else { + ctrlMask = getExecMaskFromWrRegion(DstDesc); + Value *Val = DstDesc.WrRegion ? 
DstDesc.WrRegion : Cmp->user_back(); + Dst = createDestination(Val, Signed, 0, DstDesc); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAComparisonInst(opSpec, ctrlMask, ExecSize, Dst, + Src0, Src1)); + } +} + +/*********************************************************************** + * buildConvertAddr : build code for conversion to address + * + * Enter: CI = the CallInst + * BI = BaleInfo for CI + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildConvertAddr(CallInst *CI, genx::BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + assert(!DstDesc.WrRegion); + Value *Base = Liveness->getAddressBase(CI); + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VISA_EMask_Ctrl MaskCtrl = NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; + + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + // If the offset is less aligned than the base register element type, then + // we need a different type. + Type *OverrideTy = nullptr; + Type *BaseTy = Base->getType(); + if (BaseTy->isPointerTy()) + BaseTy = BaseTy->getPointerElementType(); + unsigned ElementBytes = + BaseTy->getScalarType()->getPrimitiveSizeInBits() >> 3; + int Offset = cast(CI->getArgOperand(1))->getSExtValue(); + if ((ElementBytes - 1) & Offset) { + OverrideTy = VectorType::get(Type::getInt8Ty(CI->getContext()), + BaseTy->getVectorNumElements() * ElementBytes); + ElementBytes = 1; + } + Register *BaseReg = + RegAlloc->getRegForValue(KernFunc, Base, DONTCARESIGNED, OverrideTy); + + VISA_VectorOpnd *Dst = createAddressOperand(CI, true); + VISA_VectorOpnd *Src1 = nullptr; + + if (BaseReg->Category == RegCategory::SURFACE || + BaseReg->Category == RegCategory::SAMPLER) { + uint8_t offset = Offset >> 2; + switch (BaseReg->Category) { + case RegCategory::SURFACE: { + VISA_SurfaceVar *Decl = BaseReg->GetVar(Kernel); + unsigned int offsetB = offset * 2; // 2 is bytes size of UW + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offsetB)); + break; + } + case RegCategory::SAMPLER: { + VISA_SurfaceVar *Decl = BaseReg->GetVar(Kernel); + unsigned int offsetB = offset * 2; // 2 is bytes size of UW + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offsetB)); + break; + } + default: + report_fatal_error("Invalid state operand class: only surface, vme, and " + "sampler are supported."); + break; + } + } else { + uint8_t rowOffset = Offset >> genx::log2(GrfByteSize); + uint8_t colOffset = (Offset & (GrfByteSize - 1)) >> Log2_32(ElementBytes); + VISA_GenVar *Decl = BaseReg->GetVar(Kernel); + auto TypeSize = BaseReg->Ty->getScalarType()->getPrimitiveSizeInBits() >> 3; + unsigned int offset = colOffset * TypeSize + rowOffset * GrfByteSize; + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offset)); + } + VISA_VectorOpnd *Src2 = createSourceOperand(CI, UNSIGNED, 0, BI); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAAddrAddInst(MaskCtrl, ExecSize, Dst, Src1, Src2)); +} + +/*********************************************************************** + * buildAlloca : build code for allocating in thread-private memory + * + * Enter: CI = the CallInst + * + */ +void GenXKernelBuilder::buildAlloca(CallInst *CI, unsigned IntrinID, + unsigned Mod, const DstOpndDesc &DstDesc) { + VISA_GenVar *Sp = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PreDefined_Vars::PREDEFINED_FE_SP)); + + VISA_VectorOpnd *SpSrc = nullptr; + CISA_CALL( + Kernel->CreateVISASrcOperand(SpSrc, Sp, MODIFIER_NONE, 
                                               0, 1, 0, 0, 0));
+
+  Value *AllocaOff = CI->getOperand(0);
+  Type *AllocaOffTy = AllocaOff->getType();
+  unsigned OffVal = getResultedTypeSize(AllocaOffTy);
+
+  VISA_VectorOpnd *Imm = nullptr;
+  CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_D));
+
+  if (IntrinID == llvm::GenXIntrinsic::genx_alloca) {
+    VISA_VectorOpnd *Src = nullptr;
+    CISA_CALL(Kernel->CreateVISASrcOperand(Src, static_cast<VISA_GenVar *>(Sp),
+                                           MODIFIER_NONE, 0, 1, 0, 0, 0));
+    VISA_VectorOpnd *Dst = createDestination(CI, DONTCARESIGNED, Mod, DstDesc);
+    CISA_CALL(Kernel->AppendVISADataMovementInst(
+        ISA_MOV, nullptr, false, vISA_EMASK_M1, EXEC_SIZE_1, Dst, Src));
+  }
+
+  VISA_VectorOpnd *DstSp = nullptr;
+  CISA_CALL(Kernel->CreateVISADstOperand(DstSp, static_cast<VISA_GenVar *>(Sp),
+                                         1, 0, 0));
+
+  CISA_CALL(Kernel->AppendVISAArithmeticInst(
+      ISA_ADD, nullptr, false, vISA_EMASK_M1, EXEC_SIZE_1, DstSp, SpSrc, Imm));
+}
+
+// extracts the underlying C string from the provided constant
+static StringRef extractCStr(const Constant &CStrConst) {
+  if (isa<ConstantDataArray>(CStrConst))
+    return cast<ConstantDataArray>(CStrConst).getAsCString();
+  assert(isa<ConstantAggregateZero>(CStrConst));
+  return "";
+}
+
+/***********************************************************************
+ * buildPrintIndex : build code for storing a constant format string as
+ *                   metadata and returning the index of that string
+ *
+ * Enter:   CI = the CallInst
+ *
+ */
+void GenXKernelBuilder::buildPrintIndex(CallInst *CI, unsigned IntrinID,
+                                        unsigned Mod,
+                                        const DstOpndDesc &DstDesc) {
+  // create move with constant
+  VISA_VectorOpnd *Imm = nullptr;
+  Module* M = CI->getModule();
+  NamedMDNode *NMD = M->getOrInsertNamedMetadata("cm_print_strings");
+  unsigned NumOp = NMD->getNumOperands();
+  CISA_CALL(Kernel->CreateVISAImmediate(Imm, &NumOp, ISA_TYPE_UD));
+  VISA_VectorOpnd *Dst = createDestination(CI, DONTCARESIGNED, Mod, DstDesc);
+  CISA_CALL(Kernel->AppendVISADataMovementInst(
+      ISA_MOV, nullptr, false, vISA_EMASK_M1_NM,
+      EXEC_SIZE_1, Dst, Imm));
+
+  // access string
+  LLVMContext& Context = CI->getContext();
+  ImmutableCallSite CallSite(CI);
+  const Value *Val = CallSite.getArgument(0);
+  const Instruction *Gep = cast<Instruction>(Val);
+  Val = Gep->getOperand(0);
+  StringRef UnderlyingCStr =
+      extractCStr(*cast<GlobalVariable>(Val)->getInitializer());
+
+  // store metadata
+  MDNode* N = MDNode::get(Context, MDString::get(Context, UnderlyingCStr));
+  NMD->addOperand(N);
+}
+
+void GenXKernelBuilder::deduceRegion(Region *R, bool IsDest,
+                                     unsigned MaxWidth) {
+  assert(Subtarget);
+  if (!IsDest && !R->is2D() && R->Indirect &&
+      Subtarget->hasIndirectGRFCrossing()) {
+    // For a source 1D indirect region that might possibly cross a GRF
+    // (because we are on SKL+ so a single GRF crossing is allowed), make it
+    // Nx1 instead of 1xN to avoid crossing a GRF within a row.
+    R->VStride = R->Stride;
+    R->Width = 1;
+    R->Stride = 0;
+  }
+  // another case of converting to region format
+  if (!IsDest &&
+      (R->VStride == (int)R->Width * R->Stride || R->Width == R->NumElements)) {
+    R->Width = 1;
+    R->VStride = R->Stride;
+    R->Stride = 0;
+  } else if (R->Width > MaxWidth) {
+    // A Width of more than 16 (or whatever MaxWidth is) is not allowed. If it
+    // is more than 16, then legalization has ensured that either there is one
+    // row or the rows are contiguous (VStride == Width * Stride) and we can
+    // increase the number of rows. (Note that Width and VStride are ignored
+    // in a destination operand; legalization ensures that there is only one
+    // row.)
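+    // Worked example (illustrative, assuming MaxWidth == 16): a contiguous
+    // 32-element row (Width 32, Stride 1) is re-expressed as two rows of 16,
+    // i.e. Width = 16 and VStride = 16 * Stride, which is exactly what the
+    // two assignments below compute.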
+ R->Width = MaxWidth; + R->VStride = R->Width * R->Stride; + } + + if (R->Width == R->NumElements) { + // Use VStride 0 on a 1D region. This is necessary for src0 in line or + // pln, so we may as well do it for everything. + R->VStride = 0; + } + + if (R->Indirect) { + R->IndirectAddrOffset = 0; + if (GenXIntrinsic::isRdRegion(R->Indirect)) { + auto AddrRdR = cast(R->Indirect); + Region AddrR(AddrRdR, BaleInfo()); + assert(!AddrR.Indirect && + "cannot have address rdregion that is indirect"); + R->IndirectAddrOffset = + AddrR.Offset / 2; // address element is always 2 byte + } + } +} + +VISA_VectorOpnd * +GenXKernelBuilder::createGeneralOperand(Region *R, VISA_GenVar *Decl, + Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth) { + VISA_VectorOpnd *ResultOperand = nullptr; + // Write the vISA general operand, canonicalizing the + // region parameters where applicable. + assert(Decl && "no register allocated for this value"); + if (!IsDest) { + ResultOperand = createCisaSrcOperand( + Decl, static_cast(Mod), R->VStride, R->Width, R->Stride, + R->Offset >> genx::log2(GrfByteSize), + (R->Offset & (GrfByteSize - 1)) / R->ElementBytes); + } else { + ResultOperand = createCisaDstOperand( + Decl, R->Stride, R->Offset >> genx::log2(GrfByteSize), + (R->Offset & (GrfByteSize - 1)) / R->ElementBytes); + } + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createIndirectOperand(Region *R, + Signedness Signed, + unsigned Mod, + bool IsDest, + unsigned MaxWidth) { + VISA_VectorOpnd *ResultOperand = nullptr; + // Check if the indirect operand is a baled in rdregion. + Value *Indirect = R->Indirect; + if (GenXIntrinsic::isRdRegion(Indirect)) { + auto AddrRdR = cast(Indirect); + Indirect = AddrRdR->getOperand(0); + } + // Write the vISA indirect operand. + Register *IdxReg = RegAlloc->getRegForValue(KernFunc, Indirect, DONTCARESIGNED); + assert(IdxReg->Category == RegCategory::ADDRESS); + + bool NotCrossGrf = !(R->Offset & (GrfByteSize - 1)); + if (!NotCrossGrf) { + // Determine the NotCrossGrf bit setting (whether we can guarantee + // that adding an indirect region's constant offset does not cause + // a carry out of bit 4) + // by looking at the partial constant for the index + // before the constant is added on. + // This only works for a scalar index. + if (auto IndirInst = dyn_cast(R->Indirect)) { + auto A = AI.get(IndirInst); + unsigned Mask = (1U << std::min(5U, A.getLogAlign())) - 1; + if (Mask) { + if ((A.getExtraBits() & Mask) + (R->Offset & Mask) <= Mask && + (unsigned)(R->Offset & (GrfByteSize - 1)) <= Mask) { + // The alignment and extrabits are such that adding R->Offset + // cannot cause a carry from bit 4 to bit 5. 
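+          // Numeric example (illustrative, 32-byte GRF assumed): if the index
+          // is known 16-byte aligned (LogAlign 4, ExtraBits 0) and R->Offset
+          // is 8, then Mask == 15, 0 + 8 <= 15 and 8 <= 15, so the addition
+          // cannot carry out of bit 4 and NotCrossGrf can be set.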
+ NotCrossGrf = true; + } + } + } + } + visa::TypeDetails TD(Func->getParent()->getDataLayout(), R->ElementTy, + Signed); + unsigned VStride = R->VStride; + if (isa(R->Indirect->getType())) + // multi indirect (vector index), set vstride + VStride = 0x8000; // field to null + VISA_AddrVar *AddrDecl = IdxReg->GetVar(Kernel); + if (IsDest) { + CISA_CALL(Kernel->CreateVISAIndirectDstOperand( + ResultOperand, AddrDecl, R->IndirectAddrOffset, R->Offset, R->Stride, + (VISA_Type)TD.VisaType)); + } else { + CISA_CALL(Kernel->CreateVISAIndirectSrcOperand( + ResultOperand, AddrDecl, static_cast(Mod), + R->IndirectAddrOffset, R->Offset, VStride, R->Width, R->Stride, + (VISA_Type)TD.VisaType)); + } + return ResultOperand; +} + + +/*********************************************************************** + * createRegionOperand : create a vISA region operand + * + * Enter: R = Region + * RegNum = vISA register number (ignored if region is indirect) + * Signed = whether signed or unsigned required (only used for + * indirect operand) + * Mod = modifiers + * IsDest = true if destination operand + * MaxWidth = maximum width (used to stop TWICEWIDTH operand + * getting a width bigger than the execution size, but + * for other uses defaults to 16) + */ +VISA_VectorOpnd * +GenXKernelBuilder::createRegionOperand(Region *R, VISA_GenVar *Decl, + Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth) { + deduceRegion(R, IsDest, MaxWidth); + + if (R->Indirect) + return createIndirectOperand(R, Signed, Mod, IsDest, MaxWidth); + else + return createGeneralOperand(R, Decl, Signed, Mod, IsDest, MaxWidth); +} + + +bool GenXKernelBuilder::isInLoop(BasicBlock *BB) { + if (getLoops(BB->getParent())->getLoopFor(BB)) + return true; // inside loop in this function + // Now we need to see if this function is called from inside a loop. + // First check the cache. + auto i = IsInLoopCache.find(BB->getParent()); + if (i != IsInLoopCache.end()) + return i->second; + // Now check all call sites. This recurses as deep as the depth of the call + // graph, which must be acyclic as GenX does not allow recursion. + bool InLoop = false; + for (auto ui = BB->getParent()->use_begin(), ue = BB->getParent()->use_end(); + ui != ue; ++ui) { + auto CI = dyn_cast(ui->getUser()); + if (!CI) + continue; + assert(ui->getOperandNo() == CI->getNumArgOperands()); + if (CI->getFunction() == BB->getParent()) + continue; + if (isInLoop(CI->getParent())) { + InLoop = true; + break; + } + } + IsInLoopCache[BB->getParent()] = InLoop; + return InLoop; +} + +void GenXKernelBuilder::addWriteRegionLifetimeStartInst(Instruction *WrRegion) { + if (!GenXIntrinsic::isWrRegion(WrRegion)) + return; // No lifetime start for wrpredregion. + // See if the wrregion is in a loop. + auto BB = WrRegion->getParent(); + if (!isInLoop(BB)) + return; // not in loop + // See if the wrregion is the first of a sequence in the same basic block + // that together write the whole register. We assume that each region is + // contiguous, and the regions are written in ascending offset order, as + // that is what legalization does if the original write was to the whole + // register. 
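+  // Sketch of the pattern being matched (element counts are illustrative):
+  //   %w0 = wrregion(undef, %v0, ...)  ; writes elements  0..15
+  //   %w1 = wrregion(%w0,   %v1, ...)  ; writes elements 16..31
+  // If such a chain covers all 32 elements of the register, no lifetime.start
+  // is needed; otherwise one is added at the end of this function.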
+ unsigned NumElementsSoFar = 0; + unsigned TotalNumElements = WrRegion->getType()->getVectorNumElements(); + Instruction *ThisWr = WrRegion; + for (;;) { + Region R(ThisWr, BaleInfo()); + if (R.Indirect) + break; + if ((unsigned)R.Offset != NumElementsSoFar * R.ElementBytes) + break; + if (R.Stride != 1 && R.Width != 1) + break; + if (R.Width != R.NumElements) + break; + NumElementsSoFar += R.NumElements; + if (NumElementsSoFar == TotalNumElements) + return; // whole register is written + // Go on to next wrregion in the same basic block if any. + if (!ThisWr->hasOneUse()) + break; + ThisWr = cast(ThisWr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(ThisWr)) + break; + if (ThisWr->getParent() != BB) + break; + } + // The wrregion is in a loop and is not the first in a sequence in the same + // basic block that writes the whole register. Write a lifetime start. + addLifetimeStartInst(WrRegion); +} + +/************************************************************************************************** + * addLifetimeStartInst : add a lifetime.start instruction + * + * Enter: Inst = value to use in lifetime.start + */ +void GenXKernelBuilder::addLifetimeStartInst(Instruction *Inst) { + VISA_VectorOpnd *opnd = nullptr; + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, Inst); + if (!Reg) + return; // no register allocated such as being indirected. + + switch (Reg->Category) { + case RegCategory::GENERAL: + opnd = createCisaDstOperand(Reg->GetVar(Kernel), 1, 0, 0); + break; + case RegCategory::ADDRESS: + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + opnd, Reg->GetVar(Kernel), 0)); + break; +#if 0 // Not currently used. + case RegCategory::PREDICATE: + break; +#endif // 0 + default: + report_fatal_error("createLifetimeStartInst: Invalid register category"); + break; + } + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALifetime(LIFETIME_START, opnd)); +} + +/*********************************************************************** + * addDebugInfo : add debug infromation + */ +void GenXKernelBuilder::addDebugInfo() { + // Ensure that the last label does not get merged with the next one now we + // know that there is code in between. + LastLabel = -1; + // Check if we have a pending debug location. + if (PendingLine) { + // Do the source location debug info with vISA FILE and LOC instructions. + if (PendingFilename != "" && (PendingFilename != LastFilename || + PendingDirectory != LastDirectory)) { + SmallString<256> Filename; + // Bodge here to detect Windows absolute path even when built on cygwin. + if (sys::path::is_absolute(PendingFilename) || + (PendingFilename.size() > 2 && PendingFilename[1] == ':')) + Filename = PendingFilename; + else { + Filename = PendingDirectory; + sys::path::append(Filename, PendingFilename); + } + CISA_CALL(Kernel->AppendVISAMiscFileInst(Filename.c_str())); + LastDirectory = PendingDirectory; + LastFilename = PendingFilename; + } + if (PendingLine != LastLine) { + CISA_CALL(Kernel->AppendVISAMiscLOC(PendingLine)); + LastLine = PendingLine; + PendingLine = 0; + } + } +} + +void GenXKernelBuilder::emitOptimizationHints() { + if (skipOptWithLargeBlock(*FG)) + return; + + // Track rp considering byte variable widening. + PressureTracker RP(*FG, Liveness, /*ByteWidening*/ true); + const std::vector &WidenLRs = RP.getWidenVariables(); + + for (auto LR : WidenLRs) { + SimpleValue SV = *LR->value_begin(); + auto *R = RegAlloc->getRegForValueOrNull(FG->getHead(), SV); + // This variable is being used in or crossing a high register pressure + // region. 
Set an optimization hint not to widen it. + if (R && RP.intersectWithRedRegion(LR)) { + R->addAttribute(addStringToPool("NoWidening"), ""); + RP.decreasePressure(LR); + } + } +} + +/*********************************************************************** + * addLabelInst : add a label instruction for a basic block or join + */ +void GenXKernelBuilder::addLabelInst(Value *BB) { + // Skip this for now, because we don't know how to patch labels of branches. + if (0) { // LastLabel >= 0) { + // There has been no code since the last label, so use the same label + // for this basic block. + setLabel(BB, LastLabel); + } else { + // Need a new label. + LastLabel = getOrCreateLabel(BB, LABEL_BLOCK); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LastLabel])); + } +} + +/*********************************************************************** + * getOrCreateLabel : get/create label number for a Function or BasicBlock + */ +unsigned GenXKernelBuilder::getOrCreateLabel(Value *V, int Kind) { + int Num = getLabel(V); + if (Num >= 0) + return Num; + Num = Labels.size(); + setLabel(V, Num); + VISA_LabelOpnd *Decl = nullptr; + + // Replicate the functionality of the old compiler and make the first label + // for a function contain the name (makes sure the function label is unique) + // It's not clear this is strictly necessary any more (but doesn't do any + // harm and may even make reading the intermediate forms easier) + if (Kind == LABEL_SUBROUTINE) { + StringRef N = TheKernelMetadata.getName(); + std::string NameBuf; + if (V != FG->getHead()) { + // This is a subroutine, not the kernel/function at the head of the + // FunctionGroup. Use the name of the subroutine. + N = V->getName(); + } else { + // For a kernel/function name, fix illegal characters. The jitter uses + // the same name for the label in the .asm file, and aubload does not + // like the illegal characters. 
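+      // For example (hypothetical name): a kernel called "my.kernel" would be
+      // given a label such as "my_kernel_BB_0", assuming legalizeName() maps
+      // the '.' to '_'; the exact mapping is whatever legalizeName() does.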
+ NameBuf = legalizeName(N); + N = NameBuf; + } + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, + cutString((Twine(N) + Twine("_BB_") + Twine(Labels.size())).str()) + .c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else if (Kind == LABEL_BLOCK) { + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, cutString((Twine("BB_") + Twine(Labels.size())).str()).c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else if (Kind == LABEL_FC) { + assert(isa(V)); + auto F = cast(V); + StringRef N = F->getFnAttribute("CMCallable").getValueAsString(); + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, cutString(Twine(N).str()).c_str(), VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else { + StringRef N = V->getName(); + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, + cutString( + (Twine("_") + Twine(N) + Twine("_") + Twine(Labels.size())).str()) + .c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } + return Num; +} + +void GenXKernelBuilder::buildInlineAsm(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + InlineAsm *IA = dyn_cast(CI->getCalledValue()); + std::string AsmStr(IA->getAsmString()); + std::stringstream &AsmTextStream = CisaBuilder->GetAsmTextStream(); + + // Nothing to substitute if no constraints provided + if (IA->getConstraintString().empty()) { + AsmTextStream << AsmStr << std::endl; + return; + } + + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + + // Scan asm string in reverse direction to match larger numbers first + for (int ArgNo = ConstraintsInfo.size() - 1; ArgNo >= 0; ArgNo--) { + // Regexp to match number of operand + Regex R("\\$+" + llvm::to_string(ArgNo)); + if (!R.match(AsmStr)) + continue; + // Operand that must be substituded into inline assembly string + Value *InlasmOp = nullptr; + std::string InlasmOpAsString; + // For output collect destination descriptor with + // baling info and WrRegion instruction + DstOpndDesc DstDesc; + auto Info = ConstraintsInfo[ArgNo]; + if (Info.isOutput()) { + // If result is a struct than inline assembly + // instruction has multiple outputs + if (isa(CI->getType())) { + // Go through all users of a result and find extractelement with + // ArgNo indice: ArgNo is a number of a constraint in constraint + // list + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (EV && (EV->getIndices()[0] == ArgNo)) { + InlasmOp = EV; + break; + } + } + } else + // Single output + InlasmOp = CI; + + if (InlasmOp) { + Instruction *Inst = cast(InlasmOp); + Instruction *Head = Baling->getBaleHead(Inst); + BaleInfo BI = Baling->getBaleInfo(Head); + // If head is g_store than change head to store's + // operand and check if it's baled wrr + if (BI.Type == BaleInfo::GSTORE) { + DstDesc.GStore = Head; + Head = cast(Head->getOperand(0)); + BI = Baling->getBaleInfo(Head); + } + if (BI.Type == BaleInfo::WRREGION) { + DstDesc.WrRegion = Head; + DstDesc.WrRegionBI = BI; + } + InlasmOpAsString = createInlineAsmDestinationOperand( + InlasmOp, DONTCARESIGNED, Info.getConstraintType(), 0, DstDesc); + } else { + // Can't deduce output operand because there are no users + // but we have register allocated. If region is needed we can use + // default one based one type. 
+ SimpleValue SV(CI, ArgNo); + Register *Reg = RegAlloc->getRegForValue(KernFunc, SV, DONTCARESIGNED); + Region R(SV.getType()); + InlasmOpAsString = + createInlineAsmOperand(Reg, &R, true /*IsDst*/, DONTCARESIGNED, + Info.getConstraintType(), 0); + } + } else { + // Input of inline assembly + InlasmOp = CI->getArgOperand(ArgNo - NumOutputs); + bool IsBaled = false; + if (GenXIntrinsic::isRdRegion(InlasmOp)) { + Instruction *RdR = cast(InlasmOp); + IsBaled = Baling->isBaled(RdR); + } + InlasmOpAsString = createInlineAsmSourceOperand( + InlasmOp, DONTCARESIGNED, IsBaled, Info.getConstraintType()); + } + // Substitute string name of the variable until + // there are no possible sustitutions. Do-while + // since first match was checked in the beginning + // of the loop. + do { + AsmStr = R.sub(InlasmOpAsString, AsmStr); + } while (R.match(AsmStr)); + } + + AsmTextStream << "\n// INLASM BEGIN\n" + << AsmStr << "\n// INLASM END\n" + << std::endl; +} + +void GenXKernelBuilder::buildCall(IGCLLVM::CallInst *CI, + const DstOpndDesc &DstDesc) { + LLVM_DEBUG(dbgs() << CI << "\n"); + Function *Callee = CI->getCalledFunction(); + + if (!Callee || Callee->hasFnAttribute(genx::FunctionMD::CMStackCall)) { + buildStackCall(CI, DstDesc); + return; + } + + unsigned LabelKind = LABEL_SUBROUTINE; + if (Callee->hasFnAttribute("CMCallable")) + LabelKind = LABEL_FC; + else + assert(FG == FG->getParent()->getGroup(Callee) && + "unexpected call to outside FunctionGroup"); + + // Check whether the called function has a predicate arg that is EM. + int EMOperandNum = -1; + for (auto ai = Callee->arg_begin(), ae = Callee->arg_end(); ai != ae; ++ai) { + auto Arg = &*ai; + if (!Arg->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (Liveness->getLiveRange(Arg)->getCategory() == RegCategory::EM) { + EMOperandNum = Arg->getArgNo(); + break; + } + } + + if (EMOperandNum < 0) { + addDebugInfo(); + // Scalar calls must be marked with NoMask + CISA_CALL(Kernel->AppendVISACFCallInst( + nullptr, vISA_EMASK_M1_NM, EXEC_SIZE_1, + Labels[getOrCreateLabel(Callee, LabelKind)])); + } else { + auto PredicateOpnd = NoMask ? 
nullptr : createPred(CI, BaleInfo(), EMOperandNum); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFCallInst( + PredicateOpnd, vISA_EMASK_M1, + getExecSizeFromValue( + CI->getArgOperand(EMOperandNum)->getType()->getVectorNumElements()), + Labels[getOrCreateLabel(Callee, LabelKind)])); + } +} + +void GenXKernelBuilder::buildRet(ReturnInst *RI) { + uint32_t FloatControl = 0; + auto F = RI->getFunction(); + F->getFnAttribute(genx::FunctionMD::CMFloatControl) + .getValueAsString() + .getAsInteger(0, FloatControl); + FloatControl &= CR_Mask; + if (FloatControl != DefaultFloatControl) { + buildControlRegUpdate(CR_Mask, true); + if (DefaultFloatControl) + buildControlRegUpdate(DefaultFloatControl, false); + } + addDebugInfo(); + if (!isKernel(F) && + (F->hasFnAttribute(genx::FunctionMD::CMStackCall) || + F->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly))) { + CISA_CALL(Kernel->AppendVISACFFunctionRetInst(nullptr, vISA_EMASK_M1, + EXEC_SIZE_16)); + } else { + CISA_CALL(Kernel->AppendVISACFRetInst(nullptr, vISA_EMASK_M1, EXEC_SIZE_1)); + } +} + +/*********************************************************************** + * createRawSourceOperand : create raw source operand of instruction + * + * Enter: Inst = instruction to get source operand from + * OperandNum = operand number + * BI = BaleInfo for Inst (so we can tell whether a rdregion + * or modifier is bundled in) + */ +VISA_RawOpnd *GenXKernelBuilder::createRawSourceOperand(Instruction *Inst, + unsigned OperandNum, + BaleInfo BI, + Signedness Signed) { + VISA_RawOpnd *ResultOperand = nullptr; + Value *V = Inst->getOperand(OperandNum); + if (isa(V)) { + CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, false)); + } else { + unsigned ByteOffset = 0; + if (Baling->getBaleInfo(Inst).isOperandBaled(OperandNum)) { + Instruction *RdRegion = cast(V); + Region R(RdRegion, BaleInfo()); + ByteOffset = R.Offset; + V = RdRegion->getOperand(0); + } + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + assert(Reg->Category == RegCategory::GENERAL); + CISA_CALL(Kernel->CreateVISARawOperand( + ResultOperand, Reg->GetVar(Kernel), ByteOffset)); + } + return ResultOperand; +} + +/*********************************************************************** + * createRawDestination : create raw destination operand + * + * Enter: Inst = destination value + * WrRegion = 0 else wrregion that destination is baled into + * + * A raw destination can be baled into a wrregion, but only if the region + * is direct and its start index is GRF aligned. + */ +VISA_RawOpnd * +GenXKernelBuilder::createRawDestination(Value *V, const DstOpndDesc &DstDesc, + Signedness Signed) { + VISA_RawOpnd *ResultOperand = nullptr; + unsigned ByteOffset = 0; + if (DstDesc.WrRegion) { + V = DstDesc.WrRegion; + Region R(DstDesc.WrRegion, BaleInfo()); + ByteOffset = R.Offset; + } + Type *OverrideType = nullptr; + if (DstDesc.GStore) { + V = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(V && "out of sync"); + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + } + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + if (!Reg) { + // No register assigned. This happens to an unused raw result where the + // result is marked as RAW_NULLALLOWED in GenXIntrinsics. 
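+    // E.g. (illustrative): an intrinsic call whose raw result has no uses
+    // never receives a register from the register allocator, so the
+    // destination is emitted as the vISA null raw operand below.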
+ CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, true)); + } else { + assert(Reg->Category == RegCategory::GENERAL); + CISA_CALL(Kernel->CreateVISARawOperand( + ResultOperand, Reg->GetVar(Kernel), ByteOffset)); + } + return ResultOperand; +} + +/*********************************************************************** + * getLabel : get label number for a Function or BasicBlock + * + * Return: label number, -1 if none found + */ +int GenXKernelBuilder::getLabel(Value *V) { + std::map::iterator i = LabelMap.find(V); + if (i != LabelMap.end()) + return i->second; + return -1; +} + +/*********************************************************************** + * setLabel : set the label number for a Function or BasicBlock + */ +void GenXKernelBuilder::setLabel(Value *V, unsigned Num) { LabelMap[V] = Num; } + +unsigned GenXKernelBuilder::addStringToPool(StringRef Str) { + auto val = std::pair(Str.begin(), StringPool.size()); + auto Res = StringPool.insert(val); + return Res.first->second; +} + +StringRef GenXKernelBuilder::getStringByIndex(unsigned Val) { + for (const auto &it : StringPool) { + if (it.second == Val) + return it.first; + } + llvm_unreachable("Can't find string by index."); +} + +/*********************************************************************** + * GenXKernelBuilder::getLoops : get loop info for given function, cacheing in + * Loops map + */ +LoopInfoBase *GenXKernelBuilder::getLoops(Function *F) { + auto LoopsEntry = &Loops[F]; + if (!*LoopsEntry) { + auto DT = DTs->getDomTree(F); + *LoopsEntry = new LoopInfoBase; + (*LoopsEntry)->analyze(*DT); + } + return *LoopsEntry; +} + +/*********************************************************************** + * Get size of the argument of type 'type' in bytes considering layout of + * subtypes of aggregate type in units of size 'mod' + * mod is typically 32 (GRF) or 16 (oword) + */ +unsigned GenXKernelBuilder::getValueSize(Type *T, unsigned Mod) const { + unsigned Result = 0; + if (T->isAggregateType()) { + for (unsigned i = 0; i < T->getStructNumElements(); i++) { + Result += getValueSize(T->getContainedType(i)) / Mod + + (getValueSize(T->getContainedType(i)) % Mod ? 
                  1 : 0);
+    }
+    Result *= Mod;
+  } else
+    Result = FG->getModule()->getDataLayout().getTypeSizeInBits(T) / 8;
+  return Result;
+}
+
+unsigned GenXKernelBuilder::getFuncArgsSize(llvm::Function *F) {
+  unsigned Result = 0;
+  for (auto &Arg : F->args())
+    Result += getValueSize(&Arg);
+  return Result;
+}
+
+GenericCisaVariable *
+GenXKernelBuilder::createCisaVariable(VISAKernel *Kernel, const char *Name,
+                                      VISA_GenVar *AliasVar,
+                                      unsigned ByteSize) {
+  auto it = CisaVars[Kernel].find(Name);
+  if (it != CisaVars[Kernel].end())
+    it->second = GenericCisaVariable(Name, AliasVar, ByteSize);
+  else
+    CisaVars[Kernel].insert(
+        std::make_pair(Name, GenericCisaVariable(Name, AliasVar, ByteSize)));
+  return &(CisaVars[Kernel].at(Name));
+}
+
+static unsigned deduceByteSize(Value *V, const DataLayout &DL) {
+  return DL.getTypeSizeInBits(V->getType()->getScalarType()) / 8;
+}
+
+static unsigned deduceByteSize(CisaVariable *V, const DataLayout &DL) {
+  assert(V->getType() < ISA_TYPE_NUM);
+  return CISATypeTable[V->getType()].typeSize;
+}
+
+/**************************************************************************************************
+ * emitVectorCopy : emit vISA that copies from Src to Dst
+ *
+ * Emit a sufficient number of MOVs from Src to Dst, picking the copy size in
+ * a greedy manner
+ *
+ * T1 and T2 should be llvm::Value and CisaVariable or vice-versa,
+ * CisaVariable=>CisaVariable or Value=>Value copying is not supported here
+ *
+ */
+template <typename T1, typename T2>
+void GenXKernelBuilder::emitVectorCopy(T1 *Dst, T2 *Src, unsigned &RowOff,
+                                       unsigned &ColOff, unsigned &SrcRowOff,
+                                       unsigned &SrcColOff, int TotalSize,
+                                       bool DoCopy) {
+  auto partCopy = [&](int Sz) {
+    int ByteSz = Sz * deduceByteSize(Dst, DL);
+    assert(ByteSz);
+
+    unsigned Start = SrcRowOff;
+    unsigned End =
+        (SrcRowOff * getGRFSize() + SrcColOff + ByteSz) / getGRFSize();
+
+    // a mov must not span more than 2 GRFs
+    if (End - Start >= 2) {
+      assert(Sz > 1);
+      return;
+    }
+
+    while (TotalSize >= ByteSz) {
+      VISA_VectorOpnd *ArgSrc = nullptr, *ArgDst = nullptr;
+      unsigned Offset = SrcRowOff * GrfByteSize + SrcColOff;
+      ArgSrc = createSource(Src, UNSIGNED, Sz, &Offset);
+      SrcRowOff += (SrcColOff + ByteSz) / GrfByteSize;
+      SrcColOff = (SrcColOff + ByteSz) % GrfByteSize;
+
+      Offset = RowOff * GrfByteSize + ColOff;
+      ArgDst = createDestination(Dst, UNSIGNED, &Offset);
+      RowOff += (ColOff + ByteSz) / GrfByteSize;
+      ColOff = (ColOff + ByteSz) % GrfByteSize;
+
+      if (DoCopy)
+        CISA_CALL(Kernel->AppendVISADataMovementInst(
+            ISA_MOV, nullptr, false,
+            (NoMask ?
vISA_EMASK_M1_NM : vISA_EMASK_M1), + getExecSizeFromValue(Sz), ArgDst, ArgSrc)); + TotalSize -= ByteSz; + } + }; + partCopy(16); + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +void GenXKernelBuilder::pushStackArg(VISA_StateOpndHandle *Dst, Value *Src, + int TotalSz, unsigned &RowOff, + unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, bool DoCopy) { + VISA_GenVar *StackOff = nullptr, *Sp = nullptr; + + auto StackTmp = createCisaVariable(Kernel, "stackTmp", nullptr, TotalSz); + + auto TmpType = llvmToVisaType(Src->getType()); + auto TmpVar = StackTmp->getAlias(TmpType, Kernel); + + CISA_CALL(Kernel->CreateVISAGenVar(StackOff, "stackOff", 1, ISA_TYPE_UQ, + ALIGN_OWORD)); + unsigned RawOff = 0; + auto partCopy = [&](int Sz) { + // TODO: mb we have some constant for oword size + int ByteSz = Sz * BYTES_PER_OWORD; + int CopySz = std::min(ByteSz, TotalSz); + + while (TotalSz - ByteSz >= 0 || (TotalSz > 0 && Sz == 1)) { + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + VISA_VectorOpnd *SpOpSrc1 = nullptr; + VISA_VectorOpnd *SpOpSrc2 = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc1, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc2, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOffDst = nullptr, *TmpOffSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOffDst, StackOff, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOffSrc, StackOff, MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + emitVectorCopy(TmpVar, Src, RowOff, ColOff, SrcRowOff, SrcColOff, CopySz, + DoCopy); + VISA_VectorOpnd *Imm = nullptr; + unsigned OffVal = Sz; + if (Subtarget->useGlobalMem()) + OffVal *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UD)); + VISA_RawOpnd *RawSrc = nullptr; + CISA_CALL( + Kernel->CreateVISARawOperand(RawSrc, TmpVar->getGenVar(), RawOff)); + RawOff += Sz * BYTES_PER_OWORD; + + if (DoCopy) { + CISA_CALL(Kernel->AppendVISADataMovementInst(ISA_MOV, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + TmpOffDst, SpOpSrc1)); + if (Subtarget->useGlobalMem()) { + CISA_CALL(Kernel->AppendVISASvmBlockStoreInst( + getCisaOwordNumFromNumber(Sz), true, TmpOffSrc, RawSrc)); + } else { + CISA_CALL(Kernel->AppendVISASurfAccessOwordLoadStoreInst( + ISA_OWORD_ST, vISA_EMASK_M1, Dst, getCisaOwordNumFromNumber(Sz), + TmpOffSrc, RawSrc)); + } + } + CISA_CALL(Kernel->AppendVISAArithmeticInst(ISA_ADD, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + SpOpDst, SpOpSrc2, Imm)); + TotalSz -= ByteSz; + } + }; + + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +void GenXKernelBuilder::popStackArg(llvm::Value *Dst, VISA_StateOpndHandle *Src, + int TotalSz, unsigned &RowOff, + unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int &PrevStackOff) { + VISA_GenVar *StackOff = nullptr, *Sp = nullptr; + + auto StackTmp = createCisaVariable(Kernel, "stackTmp", nullptr, TotalSz); + + auto TmpType = llvmToVisaType(Dst->getType()); + auto TmpVar = StackTmp->getAlias(TmpType, Kernel); + + CISA_CALL(Kernel->CreateVISAGenVar(StackOff, "stackOff", 1, ISA_TYPE_UQ, + ALIGN_OWORD)); + auto partCopy = [&](int Sz) { + // TODO: mb we have some constant for oword size + int ByteSz = Sz * BYTES_PER_OWORD; + while (TotalSz - ByteSz >= 0 || (TotalSz > 0 && Sz == 1)) { + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + VISA_VectorOpnd *SpOpSrc = nullptr; + 
CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOffDst = nullptr; + VISA_VectorOpnd *TmpOffSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOffDst, StackOff, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOffSrc, StackOff, MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + VISA_VectorOpnd *Imm = nullptr; + int OffVal = PrevStackOff; + if (Subtarget->useGlobalMem()) + OffVal *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UD)); + PrevStackOff += Sz; + VISA_RawOpnd *RawSrc = nullptr; + CISA_CALL(Kernel->CreateVISARawOperand(RawSrc, TmpVar->getGenVar(), 0)); + + CISA_CALL(Kernel->AppendVISAArithmeticInst(ISA_ADD, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + TmpOffDst, SpOpSrc, Imm)); + if (Subtarget->useGlobalMem()) { + CISA_CALL(Kernel->AppendVISASvmBlockLoadInst( + getCisaOwordNumFromNumber(Sz), false, TmpOffSrc, RawSrc)); + } else { + CISA_CALL(Kernel->AppendVISASurfAccessOwordLoadStoreInst( + ISA_OWORD_LD, vISA_EMASK_M1, Src, getCisaOwordNumFromNumber(Sz), + TmpOffSrc, RawSrc)); + } + + int CopySz = std::min(ByteSz, TotalSz); + SrcRowOff = SrcColOff = 0; + emitVectorCopy(Dst, TmpVar, RowOff, ColOff, SrcRowOff, SrcColOff, CopySz); + TotalSz -= ByteSz; + } + SrcRowOff = SrcColOff = 0; + }; + + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +/************************************************************************************************** + * beginFunction : emit function prologue and arguments passing code + * + * Emit stack-related function prologue if Func is a kernel and there're + * stackcalls or Func is a stack function. + * + * Prologue performs Sp and Fp initialization (both for kernel and stack + * function). For stack functions arguments passing code is generated as well, + * %arg and stackmem passing is supported. + */ +void GenXKernelBuilder::beginFunction(Function *Func) { + VISA_GenVar *Sp = nullptr, *Fp = nullptr, *Hwtid = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Fp, PREDEFINED_FE_FP)); + CISA_CALL(Kernel->GetPredefinedVar(Hwtid, PREDEFINED_HW_TID)); + + VISA_VectorOpnd *SpOpSrc = nullptr; + VISA_VectorOpnd *SpOpSrc1 = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + VISA_VectorOpnd *SpOpDst1 = nullptr; + VISA_VectorOpnd *FpOpDst = nullptr; + VISA_VectorOpnd *FpOpSrc = nullptr; + VISA_VectorOpnd *Imm = nullptr; + + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst1, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(FpOpDst, Fp, 1, 0, 0)); + + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc1, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + + CISA_CALL( + Kernel->CreateVISASrcOperand(FpOpSrc, Fp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + + if (isKernel(Func) && (HasStackcalls || HasAlloca)) { + // init kernel stack + VISA_GenVar *Hwtid = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Hwtid, PREDEFINED_HW_TID)); + + VISA_VectorOpnd *HwtidOp = nullptr; + uint32_t Val = STACK_PER_THREAD; + + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &Val, ISA_TYPE_UD)); + CISA_CALL(Kernel->CreateVISASrcOperand(HwtidOp, Hwtid, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + if (StackSurf == PREDEFINED_SURFACE_STACK) { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_MUL, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, HwtidOp, Imm)); + } else { + VISA_GenVar *Tmp = nullptr; + CISA_CALL( + Kernel->CreateVISAGenVar(Tmp, "SpOff", 1, ISA_TYPE_UQ, ALIGN_DWORD)); + + VISA_VectorOpnd *OffOpDst = nullptr; + VISA_VectorOpnd *OffOpSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(OffOpDst, Tmp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(OffOpSrc, Tmp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_MUL, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, OffOpDst, HwtidOp, Imm)); + + VISA_VectorOpnd *OpSrc = nullptr; + if (Subtarget->useGlobalMem()) { + assert(Func->arg_size() > 0); + Value &PrivBase = *(Func->arg_end() - 1); + genx::KernelArgInfo AI(TheKernelMetadata.getArgKind(Func->arg_size() - 1)); + assert(AI.isPrivateBase()); + OpSrc = createSource(&PrivBase, DONTCARESIGNED); + } else { + VISA_GenVar *R0 = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(R0, PREDEFINED_R0)); + + CISA_CALL(Kernel->CreateVISASrcOperand(OpSrc, R0, MODIFIER_NONE, 0, 1, + 0, 0, 5)); + } + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, OpSrc)); + Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst1, SpOpSrc1, OffOpSrc); + } + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, SpOpSrc)); + // use the max available for now + unsigned SMO = Subtarget ? Subtarget->stackSurfaceMaxSize() : 8192; + Kernel->AddKernelAttribute("SpillMemOffset", 4, &SMO); + } else if (Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + if (Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + int ExtVal = 1; + Kernel->AddKernelAttribute("Extern", 4, &ExtVal); + } + // stack function prologue + VISA_GenVar *FpTmp = nullptr; + + auto *ArgVar = &CisaVars[Kernel].at("argv"); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + if (FPMap.count(Func) == 0) { + CISA_CALL( + Kernel->CreateVISAGenVar(FpTmp, "tmp", 1, ISA_TYPE_UD, ALIGN_DWORD)); + FPMap.insert(std::pair(Func, FpTmp)); + } else + FpTmp = FPMap[Func]; + + // init func stack pointers + VISA_VectorOpnd *TmpOp = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOp, FpTmp, 1, 0, 0)); + + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, TmpOp, FpOpSrc); + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, SpOpSrc); + + // unpack args + int Sz = 0, StackOff = 0; + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + bool StackStarted = false; + unsigned NoStackSize = 0; + // NOTE: using reverse iterators for args would be much better we don't have + // any though + for (auto &FArg : Func->args()) { + if (Liveness->getLiveRange(&FArg) && + Liveness->getLiveRange(&FArg)->getCategory() == RegCategory::EM) + continue; + + RowOff = 0, ColOff = 0; + unsigned ArgSize = getValueSize(FArg.getType()); + if (SrcColOff && + (FArg.getType()->isVectorTy() || ArgSize > (GrfByteSize - ColOff))) { + SrcRowOff++; + SrcColOff = 0; + NoStackSize++; + } + if (Liveness->getLiveRange(&FArg)->getCategory() == + RegCategory::PREDICATE) { + VISA_VectorOpnd *argSrc = nullptr; + Kernel->CreateVISASrcOperand( + argSrc, + ArgVar->getAlias(llvmToVisaType(FArg.getType()), Kernel) + ->getGenVar(), + MODIFIER_NONE, 0, 1, 0, SrcRowOff, SrcColOff); + auto *PReg = + RegAlloc->getRegForValueOrNull(KernFunc, SimpleValue(&FArg)); + assert(PReg); + Kernel->AppendVISASetP(vISA_EMASK_M1_NM, EXEC_SIZE_32, + PReg->GetVar(Kernel), argSrc); + } else { + if ((int)ArgVar->getByteSize() - SrcRowOff * GrfByteSize >= ArgSize && + !StackStarted) { + emitVectorCopy(&FArg, ArgVar->getAlias(&FArg, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(&FArg)); + NoStackSize = RowOff; + } else { + StackStarted = true; + VISA_StateOpndHandle *stackSurf = nullptr; + VISA_SurfaceVar *stackSurfVar = nullptr; + CISA_CALL(Kernel->GetPredefinedSurface(stackSurfVar, StackSurf)); + CISA_CALL( + Kernel->CreateVISAStateOperandHandle(stackSurf, stackSurfVar)); + popStackArg(&FArg, stackSurf, ArgSize, RowOff, ColOff, SrcRowOff, + SrcColOff, StackOff); + } + } + Sz += ArgSize; + } + if (!StackStarted && ColOff) + NoStackSize++; + auto *StackCallee = Func2Kern[Func]; + auto *FuncTy = Func->getFunctionType(); + int RetSize = + (FuncTy->getReturnType()->isVoidTy() || + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + ? 0 + : (getValueSize(FuncTy->getReturnType()) + GrfByteSize - 1) / + GrfByteSize; + + StackCallee->SetFunctionInputSize(NoStackSize); + StackCallee->SetFunctionReturnSize(RetSize); + StackCallee->AddKernelAttribute("ArgSize", 1, &NoStackSize); + StackCallee->AddKernelAttribute("RetValSize", 1, &RetSize); + } +} + +/************************************************************************************************** + * endFunction : emit function epilogue and return value passing code + * + * Emit stack-related function epilogue if Func is a stack function. + * + * Epilogue restores Sp and Fp. Return value may be passed either visa %retval + * arg or stackmem, both scalar/vector and aggregate types are supported (please + * also see build[Extract|Insert]Value). 
+ */ +void GenXKernelBuilder::endFunction(Function *Func, ReturnInst *RI) { + if (!isKernel(Func) && + (Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly))) { + VISA_GenVar *Sp = nullptr, *Fp = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Fp, PREDEFINED_FE_FP)); + + VISA_VectorOpnd *SpOpSrc = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + VISA_VectorOpnd *FpOpDst = nullptr; + VISA_VectorOpnd *FpOpSrc = nullptr; + + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(FpOpDst, Fp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(FpOpSrc, Fp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOp = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOp, FPMap[Func], MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, FpOpSrc); + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, TmpOp); + + VISA_GenVar *Ret = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Ret, PREDEFINED_RET)); + + if (!Func->getReturnType()->isVoidTy() && + !Func->getReturnType()->isAggregateType() && + Liveness->getLiveRange(RI->getReturnValue()) && + (Liveness->getLiveRange(RI->getReturnValue())->getCategory() != + RegCategory::EM && + Liveness->getLiveRange(RI->getReturnValue())->getCategory() != + RegCategory::PREDICATE)) { + GenericCisaVariable *RetVar = &CisaVars[Kernel].at("retv"); + assert(!Func->getReturnType()->isAggregateType()); + + // pack retval + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + if (getValueSize(Func->getReturnType()) <= + RetVar->getByteSize()) { + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + emitVectorCopy(RetVar->getAlias(RI->getReturnValue(), Kernel), RI->getReturnValue(), + RowOff, ColOff, SrcRowOff, + SrcColOff, getValueSize(RI->getReturnValue())); + } else { + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL(Kernel->GetPredefinedSurface(StackSurfVar, + StackSurf)); + CISA_CALL( + Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, RI->getReturnValue(), + getValueSize(Func->getReturnType()), RowOff, ColOff, + SrcRowOff, SrcColOff); + } + } + for (auto II : RetvInserts) + buildInsertRetv(II); + RetvInserts.clear(); + } +} + +void GenXKernelBuilder::buildExtractRetv(ExtractValueInst *Inst) { + auto T = Inst->getOperand(0)->getType(); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + bool UseStack = getValueSize(T) > RetVar->getByteSize(); + + auto Index = Inst->getIndices().front(); + if (T->getContainedType(Index)->isVectorTy() && + T->getContainedType(Index)->getVectorElementType()->isIntegerTy(1)) + // elements of type should be ignored + return; + + unsigned RowOff = 0, ColOff = 0; + unsigned SrcRowOff = 0, SrcColOff = 0; + for (unsigned i = 0; i < Index; i++) { + int Mod = UseStack ? 
BYTES_PER_OWORD : GrfByteSize; + SrcRowOff += (getValueSize(T->getContainedType(i)) + Mod - 1) / Mod; + } + + if (UseStack) { + int Prev = SrcRowOff; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + popStackArg(Inst, StackSurfOp, getValueSize(T->getContainedType(Index)), + RowOff, ColOff, SrcRowOff, SrcColOff, Prev); + } else + emitVectorCopy(Inst, RetVar->getAlias(Inst, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(Inst)); +} + +void GenXKernelBuilder::buildInsertRetv(InsertValueInst *Inst) { + auto T = Inst->getOperand(0)->getType(); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + bool UseStack = getValueSize(T) > RetVar->getByteSize(); + + auto Index = Inst->getIndices().front(); + if (T->getContainedType(Index)->isVectorTy() && + T->getContainedType(Index)->getVectorElementType()->isIntegerTy(1)) { + // elements of type should be ignored + return; + } + + unsigned RowOff = 0, ColOff = 0; + unsigned SrcRowOff = 0, SrcColOff = 0; + + if (!UseStack) + for (unsigned i = 0; i < Index; i++) + RowOff += (getValueSize(T->getContainedType(i)) + GrfByteSize - 1) / + GrfByteSize; + + if (UseStack) { + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, Inst->getOperand(1), + getValueSize(T->getContainedType(Index)), RowOff, ColOff, + SrcRowOff, SrcColOff); + } else + emitVectorCopy(RetVar->getAlias(Inst->getOperand(1), Kernel), + Inst->getOperand(1), RowOff, ColOff, SrcRowOff, SrcColOff, + getValueSize(Inst->getOperand(1))); +} + +void GenXKernelBuilder::buildStackCall(IGCLLVM::CallInst *CI, + const DstOpndDesc &DstDesc) { + LLVM_DEBUG(dbgs() << "Build stack call\n"; CI->print(dbgs()); dbgs() << "\n"); + Function *Callee = CI->getCalledFunction(); + auto *FuncTy = CI->getFunctionType(); + auto *StackCallee = Func2Kern[Callee]; + assert(CI->isIndirectCall() || StackCallee); + + // Check whether the called function has a predicate arg that is EM. 
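+  // (If such an argument is found, it is skipped when packing %arg below and
+  // is instead used further down to build the call predicate and to derive
+  // the execution size of the call instruction.)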
+ int EMOperandNum = -1, EMIdx = -1; + for (auto &Arg : CI->arg_operands()) { + ++EMIdx; + if (!Arg->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (Liveness->getLiveRange(Arg)->getCategory() == RegCategory::EM) { + EMOperandNum = EMIdx; + break; + } + } + + int TotalArgSize = 0; + for (auto &CallArg : CI->arg_operands()) + TotalArgSize += getValueSize(CallArg->getType()); + + VISA_GenVar *Sp = nullptr, *Arg = nullptr, *Ret = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Arg, PREDEFINED_ARG)); + CISA_CALL(Kernel->GetPredefinedVar(Ret, PREDEFINED_RET)); + + unsigned ColOff = 0, RowOff = 0, SrcRowOff = 0, SrcColOff = 0; + + int Sz = 0, NoStackSize = 0, StackArgSz = 0; + uint64_t StackOff = 0; + bool StackStarted = false; + // pack arguments + for (auto &CallArg : CI->arg_operands()) { + auto *CallArgLR = Liveness->getLiveRangeOrNull(CallArg.get()); + if (CallArgLR && CallArgLR->getCategory() == RegCategory::EM) + continue; + + assert(!CallArg->getType()->isAggregateType()); + SrcRowOff = 0, SrcColOff = 0; + unsigned ArgSize = getValueSize(CallArg->getType()); + + if (ColOff && (CallArg->getType()->isVectorTy() || + ArgSize > (GrfByteSize - ColOff))) { + RowOff++; + ColOff = 0; + // adjust size if we use only a part the last used GRF + NoStackSize++; + } + + bool IsUndef = isa(CallArg); + auto *ArgVar = &CisaVars[Kernel].at("argv"); + if ((int)ArgVar->getByteSize() - RowOff * GrfByteSize >= ArgSize && + !StackStarted) { + assert(ArgSize <= Sz - ArgVar->getByteSize() && + "cannot pass arg via stack and %arg as well"); + + SrcRowOff = 0, SrcColOff = 0; + if (!IsUndef && CallArgLR->getCategory() == RegCategory::PREDICATE) { + VISA_VectorOpnd *PredDst = nullptr; + Kernel->CreateVISADstOperand( + PredDst, + ArgVar->getAlias(llvmToVisaType(CallArg->getType()), Kernel) + ->getGenVar(), + 1, RowOff, ColOff); + auto PReg = + RegAlloc->getRegForValueOrNull(KernFunc, SimpleValue(CallArg)); + assert(PReg); + Kernel->AppendVISAPredicateMove(PredDst, + PReg->GetVar(Kernel)); + ColOff += ArgSize; + } else + emitVectorCopy( + ArgVar->getAlias(CallArg, Kernel), CallArg, RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(CallArg), !IsUndef); + Sz += ArgSize; + NoStackSize = RowOff; + } else { + StackStarted = true; + RowOff = ColOff = 0; + SrcRowOff = SrcColOff = 0; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, CallArg, ArgSize, RowOff, ColOff, SrcRowOff, + SrcColOff, !IsUndef); + + StackArgSz += (ArgSize / BYTES_PER_OWORD) + (ArgSize % BYTES_PER_OWORD ? 1 : 0); + StackOff = -StackArgSz; + } + } + if (!StackStarted && ColOff) + NoStackSize++; + + VISA_VectorOpnd *SpOpSrc = nullptr, *SpOpDst = nullptr, *Imm = nullptr; + if (StackOff) { + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, + 0, 0)); + + if (Subtarget->useGlobalMem()) + StackOff *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &StackOff, ISA_TYPE_UQ)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, SpOpSrc, Imm)); + } + + VISA_PredOpnd *Pred = nullptr; + VISA_Exec_Size Esz = EXEC_SIZE_16; + if (EMOperandNum >= 0) { + Pred = createPred(CI, BaleInfo(), EMOperandNum); + Esz = getExecSizeFromValue( + CI->getArgOperand(EMOperandNum)->getType()->getVectorNumElements()); + } + addDebugInfo(); + + auto *RetVar = &CisaVars[Kernel].at("retv"); + bool ProcessRet = + !FuncTy->getReturnType()->isVoidTy() && + !FuncTy->getReturnType()->isAggregateType() && + !(FuncTy->getReturnType()->isVectorTy() && + FuncTy->getReturnType()->getVectorElementType()->isIntegerTy(1)); + + // cannot use processRet here since aggr/em args should be co + int RetSize = + (FuncTy->getReturnType()->isVoidTy() || + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + ? 0 + : (getValueSize(FuncTy->getReturnType()) + GrfByteSize - 1) / + GrfByteSize; + if (Callee) { + CISA_CALL(Kernel->AppendVISACFFunctionCallInst( + Pred, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), EXEC_SIZE_16, + Callee->getName(), NoStackSize, RetSize)); + } else { + auto *FuncAddr = createSource(CI->getCalledValue(), DONTCARESIGNED); + assert(FuncAddr); + CISA_CALL(Kernel->AppendVISACFIndirectFuncCallInst( + Pred, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), EXEC_SIZE_16, + FuncAddr, NoStackSize, RetSize)); + } + + unsigned StackRetSz = 0; + if (!FuncTy->getReturnType()->isVoidTy() && + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + StackRetSz = (getValueSize(FuncTy->getReturnType(), BYTES_PER_OWORD) / BYTES_PER_OWORD + + ((getValueSize(FuncTy->getReturnType(), BYTES_PER_OWORD) % BYTES_PER_OWORD) ? 1 : 0)); + // unpack retval + if (ProcessRet && Liveness->getLiveRange(CI) && + Liveness->getLiveRange(CI)->getCategory() != RegCategory::EM) { + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + if (getValueSize(FuncTy->getReturnType()) <= RetVar->getByteSize()) { + emitVectorCopy(CI, RetVar->getAlias(CI, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(CI)); + } else { + int StackOffVal = -StackRetSz; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + popStackArg(CI, StackSurfOp, getValueSize(Callee->getReturnType()), RowOff, + ColOff, SrcRowOff, SrcColOff, StackOffVal); + } + } + // restore Sp + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + uint64_t OffVal = -StackRetSz; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UQ)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, SpOpSrc, Imm)); +} + +namespace { + +class GenXFinalizer : public ModulePass { + raw_pwrite_stream &Out; + LLVMContext *Ctx = nullptr; + +public: + static char ID; + explicit GenXFinalizer(raw_pwrite_stream &o) : ModulePass(ID), Out(o) {} + + virtual StringRef getPassName() const { return "GenX Finalizer"; } + + LLVMContext &getContext() { + assert(Ctx); + return *Ctx; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + } + + void fillOCLRuntimeInfo(GenXOCLRuntimeInfo &Info, GenXModule &GM, + FunctionGroupAnalysis &FGA, const GenXSubtarget &ST); + + bool runOnModule(Module &M) { + Ctx = &M.getContext(); + + GenXModule &GM = getAnalysis(); + FunctionGroupAnalysis &FGA = getAnalysis(); + GenXOCLRuntimeInfo *OCLInfo = getAnalysisIfAvailable(); + const GenXSubtarget &ST = *getAnalysis().getSubtarget(); + + std::stringstream ss; + auto *CisaBuilder = GM.GetCisaBuilder(); + if (GM.HasInlineAsm()) { + auto VISAAsmTextReader = GM.GetVISAAsmReader(); + CISA_CALL(VISAAsmTextReader->Compile("genxir", &ss, EmitVisa)); + } else + CISA_CALL(CisaBuilder->Compile("genxir", &ss, EmitVisa)); + if (OCLInfo) + fillOCLRuntimeInfo(*OCLInfo, GM, FGA, ST); + dbgs() << CisaBuilder->GetCriticalMsg(); + GM.DestroyCISABuilder(); + GM.DestroyVISAAsmReader(); + Out << ss.str(); + return false; + } +}; +} // end anonymous namespace. + +char GenXFinalizer::ID = 0; + +ModulePass *llvm::createGenXFinalizerPass(raw_pwrite_stream &o) { + return new GenXFinalizer(o); +} + +static void constructSymbolTable(FunctionGroup &FG, GenXModule &GM, + void *&Buffer, unsigned &ByteSize, + unsigned &NumEntries) { + NumEntries = std::count_if(FG.begin(), FG.end(), [](Function *F) { + return F->hasFnAttribute("referenced-indirectly"); + }); + ByteSize = NumEntries * sizeof(vISA::GenSymEntry); + // this will be eventually freed in AdaptorOCL + Buffer = new vISA::GenSymEntry[NumEntries]; + auto *Entry = static_cast(Buffer); + for (auto &F : FG) + if (F->hasFnAttribute("referenced-indirectly")) { + assert(F->getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH); + strcpy_s(Entry->s_name, vISA::MAX_SYMBOL_NAME_LENGTH, + F->getName().str().c_str()); + VISAFunction *Func = static_cast(GM.getVISAKernel(F)); + Entry->s_type = vISA::GenSymType::S_FUNC; + Entry->s_offset = Func->getGenOffset(); + Entry->s_size = Func->getGenSize(); + Entry++; + } +} + +void GenXFinalizer::fillOCLRuntimeInfo(GenXOCLRuntimeInfo &OCLInfo, + GenXModule &GM, + FunctionGroupAnalysis &FGA, + const GenXSubtarget &ST) { + using KernelInfo = GenXOCLRuntimeInfo::KernelInfo; + using CompiledKernel = GenXOCLRuntimeInfo::CompiledKernel; + using TableInfo = GenXOCLRuntimeInfo::TableInfo; + for (auto *FG : FGA) { + // Compiler info. + KernelInfo Info{*FG, ST}; + + // Finalizer info (jitter struct and gen binary). + VISAKernel *BuiltKernel = GM.getVISAKernel(FG->getHead()); + assert(BuiltKernel); + FINALIZER_INFO *JitInfo = nullptr; + BuiltKernel->GetJitInfo(JitInfo); + assert(JitInfo && "Jit info is not set by finalizer"); + void *GenBin = nullptr; + int GenBinSize = 0; // Finalizer uses signed int for size... 
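+    // GetGenxBinary hands back the kernel's Gen binary; below it is wrapped
+    // in an ArrayRef for the CompiledKernel and released with freeBlock()
+    // once the compiled kernel has been saved.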
+ BuiltKernel->GetGenxBinary(GenBin, GenBinSize); + assert(GenBin && GenBinSize && + "Unexpected null buffer or zero-sized kernel (compilation failed?)"); + TableInfo &RTable = Info.getRelocationTable(); + CISA_CALL(BuiltKernel->GetGenRelocEntryBuffer(RTable.Buffer, RTable.Size, RTable.Entries)); + TableInfo &STable = Info.getSymbolTable(); + constructSymbolTable(*FG, GM, STable.Buffer, STable.Size, STable.Entries); + + // Save it all here. + CompiledKernel FullInfo{std::move(Info), *JitInfo, + ArrayRef{static_cast(GenBin), + static_cast(GenBinSize)}}; + OCLInfo.saveCompiledKernel(std::move(FullInfo)); + + freeBlock(GenBin); + } +} + +void GenXModule::clearFinalizerArgs(std::vector& Owner) const { + std::for_each(Owner.begin(), Owner.end(), [](const char* a) { delete []a; }); + Owner.clear(); +} + +void GenXModule::collectFinalizerArgs(std::vector &Owner) const { + clearFinalizerArgs(Owner); + + auto grantArgument = [](const std::string& ArgString, + std::vector &Owner) { + const size_t BufferSize = ArgString.size() + 1; + char* ArgCopyBuff = new char [BufferSize]; + std::memcpy(ArgCopyBuff, ArgString.data(), BufferSize); + Owner.push_back(ArgCopyBuff); + }; + + grantArgument("-dumpvisa", Owner); + for (const auto& Fos: FinalizerOpts) { + // Add additional arguments if specified + std::istringstream f(Fos); + std::string s; + while (getline(f, s, ' ')) { + grantArgument(s, Owner); + } + } + Owner.push_back(nullptr); +} + +LLVMContext &GenXModule::getContext() { + assert(Ctx); + return *Ctx; +} + +void GenXModule::InitCISABuilder() { + assert(ST); + auto Platform = ST->getVisaPlatform(); + // Use SKL for unknown platforms + if (Platform == GENX_NONE) + Platform = GENX_SKL; + + // Prepare array of arguments for Builder API. + collectFinalizerArgs(CISA_Args); + + if (PrintFinalizerOptions.getValue()) { + outs() << "Finalizer Parameters:\n\t" << " -platform " << ST->getCPU(); + std::for_each(CISA_Args.begin(), CISA_Args.end(), + [](const char* Arg) { outs() << " " << Arg; }); + outs() << "\n"; + } + + CISA_CALL(CreateVISABuilder(CisaBuilder, + HasInlineAsm() ? vISA_ASM_WRITER : vISA_MEDIA, + EmitVisa ? VISA_BUILDER_VISA : VISA_BUILDER_BOTH, + Platform, CISA_Args.size() - 1, CISA_Args.data(), + WaTable)); + assert(CisaBuilder && "Failed to create VISABuilder!"); +} + +VISABuilder *GenXModule::GetCisaBuilder() { + if (!CisaBuilder) + InitCISABuilder(); + return CisaBuilder; +} + +void GenXModule::DestroyCISABuilder() { + if (CisaBuilder) { + CISA_CALL(DestroyVISABuilder(CisaBuilder)); + CisaBuilder = nullptr; + } +} + +void GenXModule::InitVISAAsmReader() { + assert(ST); + auto Platform = ST->getVisaPlatform(); + // Use SKL for unknown platforms + if (Platform == GENX_NONE) + Platform = GENX_SKL; + + // Prepare array of arguments for Builder API. + collectFinalizerArgs(VISA_Args); + + // Prepare array of arguments for Builder API. 
+ if (PrintFinalizerOptions.getValue()) { + outs() << "Finalizer Parameters:\n\t" << " -platform " << ST->getCPU(); + std::for_each(VISA_Args.begin(), VISA_Args.end(), + [](const char* Arg) { outs() << " " << Arg; }); + outs() << "\n"; + } + + CISA_CALL(CreateVISABuilder(VISAAsmTextReader, vISA_ASM_READER, + VISA_BUILDER_BOTH, Platform, + VISA_Args.size() - 1, VISA_Args.data(), + WaTable)); + assert(VISAAsmTextReader && "Failed to create VISAAsmTextReader!"); +} + +VISABuilder *GenXModule::GetVISAAsmReader() { + if (!VISAAsmTextReader) + InitVISAAsmReader(); + return VISAAsmTextReader; +} + +void GenXModule::DestroyVISAAsmReader() { + if (VISAAsmTextReader) { + CISA_CALL(DestroyVISABuilder(VISAAsmTextReader)); + VISAAsmTextReader = nullptr; + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp new file mode 100644 index 000000000000..623bccd4c6fa --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp @@ -0,0 +1,1759 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXCoalescing +/// -------------- +/// +/// The LLVM target independent code generator, used by most backends, has a +/// coalescing pass that runs after de-SSA of the machine IR and two-address +/// handling, and attempts to remove the added copies by coalescing values. It +/// also attempts to coalesce a value with a hardreg that it is copied to/from. +/// +/// This GenX coalescing and copy insertion pass is a bit different, in that +/// it runs on LLVM IR, which must remain in SSA, and it attempts to coalesce +/// values to try and avoid adding the copy in the first place. In any phi node +/// or two address op where it fails to coalesce, it inserts a copy (and +/// coalesces the result of the copy into the result of the phi node or +/// two address op). +/// +/// There are three different kinds of coalescing. Copy coalescing is done first, +/// then the other two are done together. +/// +/// 1. Copy coalescing. 
+/// +/// Generally there are no copy instructions in SSA, but we +/// can treat a bitcast as a copy (the operand and result can live in the +/// same register aliased in different registers), and an extractvalue is +/// treated as a copy to be coalesced, and the "inserted value" operand +/// and the corresponding element(s) of the result in an insertvalue are +/// treated as a copy to be coalesced. +/// +/// Copy coalescing represents two values that are known to be identical +/// occupying the same register at the same time, thus it is possible even +/// if the two values interfere (are live at the same point). Because we +/// handle copy coalescing before any other kind of coalescing, it usually +/// succeeds. +/// +/// This only works because we do copy coalescing first, so we know that +/// neither value that we want to copy coalesce has already undergone normal +/// or phi coalescing. +/// +/// However there is a case when copy coalescing between two live ranges +/// LR1 and LR2 (each of which is possibly already copy coalesced) cannot be +/// allowed: when LR2 loops round and has a phi use in the same basic block +/// as a phi definition in LR1, where the phi use of LR2 is after the phi +/// definition of LR1. This can happen because LLVM IR does not attach any +/// meaning to the order of phi nodes, but the GenX backend does with its +/// instruction numbering. +/// +/// This constraint on copy coalescing is embodied in the concept of +/// "copy-interference". The two live ranges LR1 and LR2 copy-interfere, +/// meaning they cannot be copy coalesced, if LR1 has a phi definition, +/// one of whose numbers is within LR2's live range. +/// +/// 2. Normal coalescing +/// +/// This arises where we have a two-address operation, that is, it has an +/// operand that needs to be in the same register as the result, because the +/// instruction represents a partial write operation. The main example of +/// this is wrregion, but there are also some shared function intrinsics +/// that need this. +/// +/// Here, we gather all the possible coalesces (including the phi ones), +/// together with an estimate of the cost of failing to coalesce (due to +/// needing to insert a copy), and then sort them in cost order and process +/// them. +/// +/// This kind of coalescing is possible only if the two live ranges do not +/// interfere. If coalescing fails, we need to insert a copy just before +/// the instruction, creating a new value with a very short live range +/// that can trivially be coalesced with the result of the original +/// instruction. +/// +/// Some subkinds of normal coalescing are: +/// +/// 2a. call arg pre-copy +/// +/// A call arg needs to be coalesced with or copied to the corresponding +/// function arg. +/// +/// Unlike most other kinds of coalescing, if coalescing fails, the copy +/// insertion is delayed until later, so we can ensure that the copies +/// are in the same order as the args, as the live ranges were computed +/// on that basis. +/// +/// Normally, call arg pre-copy coalescing occurs, like other normal +/// coalescing, if the two live ranges do not interfere. If this fails, +/// we can still do *call arg special coalescing* (CASC) of call arg A +/// and function arg B as long as both of the following are true: +/// +/// i. B has not been normal coalesced into anything (which would be +/// in the subroutine or some other subroutine it calls), except +/// that B is allowed to be call arg pre-copy coalesced; +/// +/// ii. 
For any other call site where the corresponding call arg is not +/// A, A does not interfere with it. +/// +/// Call arg special coalescing allows call arg A and function arg B to +/// be in the same register, even if A is used after the call, as long +/// as that register is not already being used for a different value +/// in the subroutine, and as long as a different value for the call +/// arg is not used at a different call site where A is live. +/// +/// **Note**: Call arg special coalescing is disabled, because it broke +/// a test and I never got round to investigating why. I don't even know +/// if it would be beneficial any more, given more recent changes to +/// liveness and coalescing. +/// +/// 2b. ret value pre-copy +/// +/// At a ReturnInst, the return value operand needs to be coalesced with +/// or copied to the unified return value for the function. This is +/// handled mostly the same as a normal coalesce. +/// +/// 2c. ret value post-copy +/// +/// After a CallInst for a subroutine call, the unified return value +/// needs to be coalesced with or copied to the result of the call. On +/// failure, the copy insertion is delayed until later. +/// +/// 3. Phi coalescing +/// +/// This is how we "de-SSA" the code. A phi incoming wants to coalesce with +/// the result of the phi node. +/// +/// Again, this kind of coalescing is possible only if the two live ranges +/// do not interfere. (A phi incoming can never interfere with its phi +/// result, but earlier coalescing could make them now interfere.) If +/// coalescing fails, we need to insert a copy at the end of the incoming +/// predecessor basic block. In fact we defer the copy insertion from failed +/// phi coalescing to the end, because we need to make sure the inserted +/// copies are in the same order as the phi nodes, as that is the basis on +/// which the live ranges were constructed. +/// +/// After phi coalescing, the LLVM IR is still in SSA form, but the phi +/// coalescing, and the copies inserted where phi coalescing failed, mean +/// that it is trivial to transform into non-SSA vISA code: generate code for +/// the phi copies, and ignore the phi nodes themselves because they are +/// completely coalesced. +/// +/// Kernel argument copying +/// ^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The kernel argument offsets (i.e. where kernel arguments appear in the GRF +/// on entry to the kernel) are set in a very early pass just after Clang +/// codegen. This sets offsets and packs holes in a way that is specific to the +/// language being compiled and its contract with its runtime. +/// +/// However, when we get here, we may find that a live range that contains a +/// kernel argument has an alignment requirement that the offset from +/// earlier does not comply with. +/// +/// So an extra function of this pass, after doing the coalescing, is to spot +/// this case, where a kernel argument has an offset that is not aligned enough, +/// and insert an extra copy at the start of the function. 
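///
/// As a simplified illustration of the copy insertion described above (taken
/// from processPhiCopy below): when a phi incoming fails to coalesce, the
/// incoming value is materialized into a fresh copy whose result joins the
/// phi's live range, and the phi is rewritten to use it:
///
///   Instruction *NewCopy =
///       insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num);
///   Phi->setIncomingValue(Inc, NewCopy);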
+/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_COALESCING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include +#include + +using namespace llvm; +using namespace genx; + +static cl::opt GenXShowCoalesceFailThreshold("genx-show-coalesce-fail-threshold", cl::init(UINT_MAX), cl::Hidden, + cl::desc("GenX size threshold (bytes) for showing coalesce fails.")); + +// Diagnostic information for error/warning relating fast-composition. +class DiagnosticInfoFastComposition : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. + DiagnosticInfoFastComposition(Instruction *Inst, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoFastComposition::KindID = 0; + +namespace { + + // Candidate : description of a coalescing candidate + struct Candidate { + genx::SimpleValue Dest; + Use *UseInDest; + unsigned SourceIndex; + unsigned Priority; + unsigned Serial; + Candidate(SimpleValue Dest, Use *UseInDest, unsigned SourceIndex, + unsigned Priority, unsigned Serial) + : Dest(Dest), UseInDest(UseInDest), SourceIndex(SourceIndex), + Priority(Priority), Serial(Serial) {} + bool operator<(const Candidate &C2) const { + if (Priority != C2.Priority) + return Priority > C2.Priority; + // Make the sort order preserving for equal priority, to get consistent + // results across different runs. 
+ return Serial < C2.Serial; + } + }; + + struct PhiCopy { + PHINode *Phi; + unsigned IncomingIdx; + PhiCopy(PHINode *Phi, unsigned IncomingIdx) + : Phi(Phi), IncomingIdx(IncomingIdx) {} + }; + + // GenX coalescing pass + class GenXCoalescing : public FunctionGroupPass { + private: + const GenXSubtarget *ST; + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + DominatorTreeGroupWrapperPass *DTWrapper; + std::vector CopyCandidates; + std::vector NormalCandidates; + std::vector Callables; + public: + static char ID; + explicit GenXCoalescing() : FunctionGroupPass(ID) {} + virtual StringRef getPassName() const { return "GenX coalescing and copy insertion"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + private: + void recordCandidates(FunctionGroup *FG); + unsigned getPriority(Type *Ty, BasicBlock *BB); + // Various permutations of the function to record a coalescing candidate. + void recordCopyCandidate(Value *Dest, Use *UseInDest, unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &CopyCandidates); + } + void recordCopyCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority) { + recordCandidate(Dest, UseInDest, SourceIndex, Priority, &CopyCandidates); + } + void recordNormalCandidate(Instruction *Dest, Use *UseInDest, + unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &NormalCandidates); + } + void recordNormalCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority) { + recordCandidate(Dest, UseInDest, SourceIndex, Priority, + &NormalCandidates); + } + void recordPhiCandidate(Value *Dest, Use *UseInDest, unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &NormalCandidates); + } + void recordCandidate(SimpleValue Dest, Use *UseInDest, unsigned SourceIndex, + unsigned Priority, std::vector *Candidates); + void recordCallCandidates(FunctionGroup *FG); + void recordCallArgCandidates(Value *Dest, unsigned ArgNum, + ArrayRef Insts); + // Functions for processing coalecing candidates. 
+ void processCopyCandidate(Candidate *Cand) { + processCandidate(Cand, true /*IsCopy*/); + } + void processCandidate(Candidate *Cand, bool IsCopy = false); + void processPhiNodes(FunctionGroup *FG); + void analysePhiCopies(PHINode *Phi, std::vector &ToProcess); + void processPhiCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis); + void processPhiBranchingJoinLabelCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis); + PHINode *copyNonCoalescedPhi(PHINode *PhiPred, PHINode *PhiSucc); + void processCalls(FunctionGroup *FG); + void processKernelArgs(FunctionGroup *FG); + void coalesceOutputArgs(FunctionGroup *FG); + void coalesceCallables(); + void coalesceGlobalLoads(FunctionGroup *FG); + Instruction *insertCopy(SimpleValue Input, LiveRange *LR, + Instruction *InsertBefore, StringRef Name, + unsigned Number); + Instruction *insertIntoStruct(Type *Ty, unsigned FlattenedIndex, + Value *OldStruct, Instruction *NewVal, + Instruction *InsertBefore); + void showCoalesceFail(SimpleValue V, const DebugLoc &DL, const char *Intro, + LiveRange *DestLR, LiveRange *SourceLR); + // Helpers + DominatorTree *getDomTree(Function *F) { return DTWrapper->getDomTree(F); } + }; + +} // end anonymous namespace + +char GenXCoalescing::ID = 0; +namespace llvm { +void initializeGenXCoalescingPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXCoalescing, "GenXCoalescing", "GenXCoalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass); +INITIALIZE_PASS_END(GenXCoalescing, "GenXCoalescing", "GenXCoalescing", false, false) + +FunctionGroupPass *llvm::createGenXCoalescingPass() { + initializeGenXCoalescingPass(*PassRegistry::getPassRegistry()); + return new GenXCoalescing(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the coalescing pass for this FunctionGroup + */ +bool GenXCoalescing::runOnFunctionGroup(FunctionGroup &FG) +{ + // Get analyses that we use and/or modify. + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + DTWrapper = &getAnalysis(); + + // Coalesce all global loads prior to normal coalescing. + coalesceGlobalLoads(&FG); + + // Record all the coalescing candidates except the call arg and return + // value pre-copy ones. + recordCandidates(&FG); + + // Process the copy coalescing candidates. + for (unsigned i = 0; i != CopyCandidates.size(); ++i) + processCopyCandidate(&CopyCandidates[i]); + + // Record the call arg and return value pre-copy candidates. + recordCallCandidates(&FG); + + // Sort the array of normal coalescing candidates (including phi ones) then + // process them. + std::sort(NormalCandidates.begin(), NormalCandidates.end()); + for (unsigned i = 0; i != NormalCandidates.size(); ++i) + processCandidate(&NormalCandidates[i]); + + // Now scan all phi nodes again, inserting copies where necessary. Doing + // them in one go here ensures that the copies appear in the predecessor + // blocks in the same order as the phi nodes, which is the basis on which + // we computed live ranges. + processPhiNodes(&FG); + + // Scan all the calls, inserting copies where necessary for call arg + // pre-copies and return value pre- and post-copies. 
Doing them in one go + // here ensures that the copies appear in the order that live range + // computation assumed they would appear. Also, for call arg and return + // value pre-copies, a single coalesce candidate is shared across multiple + // calls/returns using the same LR, so we need this separate scan to find + // the calls/returns. + processCalls(&FG); + + // Add a copy for each kernel arg that is not aligned enough. + processKernelArgs(&FG); + coalesceCallables(); + coalesceOutputArgs(&FG); + + CopyCandidates.clear(); + NormalCandidates.clear(); + Callables.clear(); + return true; +} + +/*********************************************************************** + * recordCandidates : record all the coalescing candidates from code + * + * This does not record call arg pre-copy or ret value pre-copy candidates. + * That is done in recordCallCandidates. + */ +void GenXCoalescing::recordCandidates(FunctionGroup *FG) +{ + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (PHINode *Phi = dyn_cast(Inst)) { + // Phi node. For each incoming, record a phi candidate, unless it is a + // registerless value (EM/RM). + // If the incoming block is a branching join label block, then we + // cannot insert any phi copies there, so give the coalescing + // candidate a high priority to ensure it gets coalesced first. + if (Liveness->getLiveRange(Phi)->getCategory() + < RegCategory::NUMREALCATEGORIES) { + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + auto IncomingBlock = Phi->getIncomingBlock(i); + unsigned Priority = getPriority(Phi->getType(), IncomingBlock); + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) + Priority = UINT_MAX; + recordPhiCandidate(Phi, &Phi->getOperandUse(i), Priority); + } + } + } else if (IGCLLVM::CallInst *CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + if (CI->isInlineAsm()) { + InlineAsm *IA = cast(CI->getCalledValue()); + // Do not process if no constraints provided or it's baled + // (the coalescing actually needs to be done at the wrregion). + if (IA->getConstraintString().empty() || Baling->isBaled(CI)) + continue; + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + // we need to coalesce if there is a '+' modifier + // because those operands are tied and have to be in the same + // registers + for (unsigned ArgNo = 0; ArgNo < ConstraintsInfo.size(); + ArgNo++) { + auto &Info = ConstraintsInfo[ArgNo]; + if (!Info.isOutput() || !Info.hasMatchingInput()) + continue; + unsigned ActualIdx = Info.getMatchingInput() - NumOutputs; + auto OpInst = dyn_cast(CI->getOperand(ActualIdx)); + if (!OpInst || Baling->isBaled(OpInst)) + continue; + Use *OpUse = &CI->getOperandUse(ActualIdx); + if (isa(CI->getType())) { + unsigned Priority = getPriority( + IndexFlattener::getElementType(CI->getType(), ArgNo), + CI->getParent()); + recordNormalCandidate(SimpleValue(CI, ArgNo), OpUse, 0, + Priority); + } else { + recordNormalCandidate( + CI, OpUse, 0, + getPriority(CI->getType(), CI->getParent())); + } + } + } else if (CI->isIndirectCall()) + continue; + // This is a non-intrinsic call. If it returns a value, mark + // (elements of) the return value for coalescing with the + // unified return value. 
+ else if (!CI->getType()->isVoidTy()) { + for (unsigned i = 0, + e = IndexFlattener::getNumElements(CI->getType()); + i != e; ++i) + recordNormalCandidate(SimpleValue(CI, i), 0, i, + getPriority(IndexFlattener::getElementType( + CI->getType(), i), CI->getParent())); + } else { + // handle callable kernel + Function *Callee = CI->getCalledFunction(); + if (Callee->hasFnAttribute("CMCallable")) { + if (F->hasFnAttribute("CMCallable")) { + DiagnosticInfoFastComposition Err(CI, + "Callable function must not call another callable" + " function", DS_Error); + F->getContext().diagnose(Err); + } + Callables.push_back(CI); + } + } + } else { + int OperandNum = getTwoAddressOperandNum(CI); + if (OperandNum >= 0) { + // This is an intrinsic with a two address operand (including + // the case of operand 0 in wrregion). That operand has to be in + // the same register as the result. + if (Baling->isBaled(CI)) { + // The intrinsic is baled into a wrregion. The two address + // operand must also have a rdregion baled in whose input is + // the "old value" input of the wrregion, and the coalescing + // actually needs to be done at the wrregion. That is handled + // when this pass reaches the wrregion, so we do not want to do + // anything here. + // + // it may also be baled into a g_store. + // assert(Baling->getBaleInfo(CI).isOperandBaled(OperandNum) && + // "expecting rdregion to be baled in to the two addr operand"); + continue; + } + // Normal unbaled twoaddr operand. + recordNormalCandidate(CI, &CI->getOperandUse(OperandNum), + getPriority(CI->getType(), CI->getParent())); + } + } + } else if (isa(Inst) || isa(Inst)) { + assert(!isa(Inst->getType()) && "not expecting bitcast to struct"); + assert(!isa(Inst->getOperand(0)->getType()) && "not expecting bitcast from struct"); + // The source and destination of a bitcast can copy coalesce, + // but only if it is not the case that the source is a phi and + // the destination has a use in a phi node in the same block and + // after the source's phi. If the above is the case, then we try + // and normal coalesce instead, which fails, leading to a copy + // being generated. + if (GenXLiveness::wrapsAround(Inst->getOperand(0), Inst)) { + recordNormalCandidate(Inst, &Inst->getOperandUse(0), + getPriority(Inst->getType(), Inst->getParent())); + } else if (Liveness->getLiveRangeOrNull(Inst)) { + recordCopyCandidate(Inst, &Inst->getOperandUse(0), + getPriority(Inst->getType(), Inst->getParent())); + } + } else if (auto EVI = dyn_cast(Inst)) { + // extractvalue: copy coalesce the element being extracted, as long as + // both source and destination have live ranges. The two cases where + // they don't are: + // 1. the source live range got removed in the code below that + // handles undef elements in an insertvalue chain; + // 2. this is the extract of the !any(EM) result of a goto/join, + // which does not have a live range because it is baled in to the + // branch. + if (Liveness->getLiveRangeOrNull(Inst)) { + unsigned Index = IndexFlattener::flatten( + cast(EVI->getAggregateOperand()->getType()), + EVI->getIndices()); + if (Liveness->getLiveRangeOrNull( + SimpleValue(Inst->getOperand(0), Index))) { + recordCopyCandidate(SimpleValue(EVI), &Inst->getOperandUse(0), Index, + getPriority(EVI->getType(), EVI->getParent())); + } + } + } else if (auto IVI = dyn_cast(Inst)) { + // insertvalue: + // First, if the struct value input is undef, scan the possible chain + // of insertvalues and remove the live range for any SimpleValue that + // is undef. 
We need to do this to stop a register being allocated + // later for a coalesced SimpleValue from a chain of insertvalues + // for a return where that element is never set. + auto ST = cast(IVI->getType()); + unsigned NumElements = IndexFlattener::getNumElements(ST); + if (isa(IVI->getOperand(0))) { + SmallVector IsDefined; + IsDefined.resize(NumElements, false); + // For each insertvalue in the chain: + for (auto ThisIVI = IVI; ThisIVI;) { + // For the element set by this one, set it as defined (unless the + // input is undef). + IsDefined[IndexFlattener::flatten(ST, ThisIVI->getIndices())] + = !isa(IVI->getOperand(1)); + // For any element that is still undef, remove its live range. + for (unsigned i = 0; i != NumElements; ++i) + if (!IsDefined[i]) + Liveness->removeValue(SimpleValue(ThisIVI, i)); + if (!ThisIVI->hasOneUse()) + break; + ThisIVI = dyn_cast(ThisIVI->use_begin()->getUser()); + } + } + // Copy coalesce the element being inserted and the other elements, + // as long as the appropriate live ranges did not get removed above. + unsigned Index = IndexFlattener::flatten(ST, IVI->getIndices()); + for (unsigned i = 0; i != NumElements; ++i) { + if (!Liveness->getLiveRangeOrNull(SimpleValue(IVI, i))) + continue; + if (i == Index) { + if (Liveness->getLiveRangeOrNull(Inst->getOperand(1))) + recordCopyCandidate(SimpleValue(IVI, i), &Inst->getOperandUse(1), 0, + getPriority(IVI->getOperand(1)->getType(), IVI->getParent())); + } else { + if (Liveness->getLiveRangeOrNull(SimpleValue(Inst->getOperand(0), i))) + recordCopyCandidate(SimpleValue(IVI, i), &Inst->getOperandUse(0), i, + getPriority(IVI->getOperand(1)->getType(), IVI->getParent())); + } + } + } + } + } + } +} + +/*********************************************************************** + * recordCallCandidates : record the call arg pre-copy and ret value + * pre-copy candidates + * + * This is done here, after copy coalescing has been done, so we can + * more accurately estimate the cost of not coalescing a candidate by + * summing the cost from each call site / return instruction that uses + * the same (copy coalesced) value. + */ +void GenXCoalescing::recordCallCandidates(FunctionGroup *FG) +{ + // For each subroutine... + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + // Gather the call sites. + SmallVector CallSites; + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + CallSites.push_back(CI); + // For each arg... + unsigned ArgIdx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++ArgIdx) { + Argument *Arg = &*ai; + if (Arg->use_empty()) + continue; // Ignore unused arg. + // Record a coalesce candidate for each unique input LR for each + // struct element in the arg. + recordCallArgCandidates(Arg, ArgIdx, CallSites); + } + // Now scan for return value pre-copies. + if (F->getReturnType()->isVoidTy()) + continue; + // Gather the return insts by looking at the terminator of each BB. + SmallVector RetInsts; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto RetInst = dyn_cast(fi->getTerminator()); + if (RetInst) + RetInsts.push_back(RetInst); + } + // Record a coalesce candidate for each unique input LR for each + // struct element in the return value. 
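+    // (The unified return value plays the role of the destination here, and
+    // ArgNum 0 selects operand 0 of each ReturnInst, i.e. the value being
+    // returned.)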
+ recordCallArgCandidates(Liveness->getUnifiedRet(F), 0, RetInsts); + } +} + +/*********************************************************************** + * recordCallArgCandidates : common code for adding a candidate for each + * struct element of a call arg or a return value pre-copy + * + * Enter: Dest = destination Value; the Argument for a call arg, or the + * Function's unified return value for a ret pre-copy + * ArgNum = argument number for call arg, 0 for ret pre-copy + * Insts = array of call sites or return instructions + * + * For each struct element, this adds a coalesce candidate for each unique LR + * used as a call arg or return value. + */ +namespace { struct CallArg { + Use *U; + LiveRange *LR; + CallArg(Use *U, LiveRange *LR) : U(U), LR(LR) {} +}; } +void GenXCoalescing::recordCallArgCandidates(Value *Dest, unsigned ArgNum, + ArrayRef Insts) +{ + for (unsigned StructIdx = 0, + StructEnd = IndexFlattener::getNumElements(Dest->getType()); + StructIdx != StructEnd; ++StructIdx) { + // For each unique LR used as this arg at any call site, sum the + // cost and add a candidate. + SmallVector CallArgs; + for (unsigned i = 0, ie = Insts.size(); i != ie; ++i) { + Use *U = &Insts[i]->getOperandUse(ArgNum); + CallArgs.push_back(CallArg(U, + Liveness->getLiveRangeOrNull(SimpleValue(*U, StructIdx)))); + } + for (unsigned i = 0, ie = CallArgs.size(); i != ie; ++i) { + LiveRange *LR = CallArgs[i].LR; + if (!LR) + continue; // Already done this one (or it was an undef). + unsigned Priority = 0; + for (unsigned j = i, je = CallArgs.size(); j != je; ++j) { + if (LR != CallArgs[j].LR) + continue; + Priority += getPriority(nullptr, Insts[j]->getParent()); + CallArgs[j].LR = 0; // Blank out so we can see we have done this one. + } + Use *U = CallArgs[i].U; + Priority *= getPriority(IndexFlattener::getElementType( + (*U)->getType(), StructIdx), 0); + recordNormalCandidate(SimpleValue(Dest, StructIdx), + U, StructIdx, Priority); + } + } +} + +/*********************************************************************** + * getPriority : get priority of coalescing candidate + * + * Enter: Ty = type that would need to be copied if coalescing failed, + * so we can estimate the copy cost. Can be nullptr, in which + * case the copy cost is assumed to be 1 + * BB = basic block where copy would be inserted, so we can use + * loop depth to adjust the cost. Can be nullptr, in which + * the loop depth is assumed to be 0 + * + * Return: priority (estimate of cost of inserting a copy) + * + * getPriority(Ty, BB) is equivalent to getPriority(Ty, 0) * getPriority(0, BB). + */ +unsigned GenXCoalescing::getPriority(Type *Ty, BasicBlock *BB) +{ + // Set priority to the number of GRFs. + // FIXME this should also take into account a non power of two + // vector size, which would result in multiple copy instructions. + // See GenXCoalescing::insertCopy. + // FIXME scale by loop depth. 
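+  // Worked example (illustrative): a <16 x i32> value is 16 * 32 = 512 bits,
+  // so with the 256-bit GRF size assumed below its priority is
+  // (512 + 255) / 256 = 2; a scalar or a null type costs 1.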
+ unsigned Priority = 1; + if (Ty) { + if (VectorType *VT = dyn_cast(Ty)) { + Priority = VT->getNumElements() * VT->getElementType()->getPrimitiveSizeInBits(); + Priority = (Priority + 255) / 256; + } + } + return Priority; +} + +/*********************************************************************** + * recordCandidate : record a candidate for coalescing + * + * Enter: Dest = destination of copy + * UseInDest = pointer to the use of the source in Dest + * SourceIndex = flattened index of element in source struct + * Priority = priority of coalescing this candidate + * Candidates = vector of candidates to push to + * + * For call arg coalescing, Dest is the subroutine's Argument, and + * UseInDest/SourceIndex are the use in one of the possibly many call sites + * using the same source value. + * + * For ret value pre-copy coalescing (before the return inst), Dest is the the + * unified return value, and UseInDest/SourceIndex are the use in one of the + * possibly many return instructions using the same source value. + * + * For ret value post-copy coalescing (after the call inst), Dest is the + * CallInst, and UseInDest and SourceIndex are 0. + */ +void GenXCoalescing::recordCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority, std::vector *Candidates) +{ + if (UseInDest && isa(*UseInDest)) + return; + assert(!UseInDest || !isa(*UseInDest)); + Candidates->push_back(Candidate(Dest, UseInDest, SourceIndex, Priority, + Candidates->size())); +} + +/*********************************************************************** + * processCandidate : process a coalescing candidate + * + * This attempts to coalesce the candidate. On failure, it inserts a copy + * if necessary: + * + * - a copy candidate never fails to coalesce; + * - a two address candidate needs a copy and it is inserted here; + * - a phi candidate needs a copy, but it is not inserted here. Instead it + * is inserted later so we can ensure that multiple copies inserted at + * the end of an incoming block are in phi node order, which was the + * assumption made by the live range calculation. + * + * See the comment at the top of recordCandidate for the special values of + * fields in Candidate for a call arg coalesce and a ret value coalesce. + */ +void GenXCoalescing::processCandidate(Candidate *Cand, bool IsCopy) +{ + SimpleValue Dest = Cand->Dest; + SimpleValue Source; + if (!Cand->UseInDest) { + // This is a return value post-copy coalesce candidate. The actual source + // is the unified return value. + Source = SimpleValue(Liveness->getUnifiedRet(cast( + Dest.getValue())->getCalledFunction()), Cand->SourceIndex); + } else + Source = SimpleValue(*Cand->UseInDest, Cand->SourceIndex); + LLVM_DEBUG(dbgs() << "Trying coalesce from "; + Source.printName(dbgs()); + dbgs() << " to "; + Dest.printName(dbgs()); + dbgs() << " priority " << Cand->Priority; + if (isa(Dest.getValue())) + dbgs() << " (call arg)"; + else if (Liveness->isUnifiedRet(Dest.getValue())) + dbgs() << " (ret pre-copy)"; + else if (!Cand->UseInDest) + dbgs() << " (ret post-copy)"; + dbgs() << "\n"); + LiveRange *DestLR = Liveness->getLiveRange(Dest); + LiveRange *SourceLR = 0; + // Source should not be a constant (but could be undef) because + // GenXLowering ensured that all our two address operands and phi incomings + // are not constant. 
+ assert(!Cand->UseInDest || !isa(Source.getValue()) || isa(Source.getValue())); + SourceLR = Liveness->getLiveRange(Source); + assert(DestLR); + if (SourceLR == DestLR) + return; // already coalesced + if (SourceLR && SourceLR->Category == DestLR->Category) { + if (IsCopy) { + // For a copy candidate, we can coalesce if the source and destination do + // not copy-interfere, i.e. we do not have a situation where DestLR + // wraps round a loop into a phi use in the same basic block as the phi + // def of SourceLR but after it. + if (!Liveness->copyInterfere(SourceLR, DestLR)) { + Liveness->coalesce(DestLR, SourceLR, /*DisallowCASC=*/ false); + return; + } + } else { + // For a normal candidate, we can coalesce if the source and destination + // do not interfere, i.e. there is no point in the program where both + // LRs are live. + if (!Liveness->twoAddrInterfere(DestLR, SourceLR)) { + // In the coalesce, disallow future call arg special coalescing if this + // is not a call arg coalesce. + Liveness->coalesce(DestLR, SourceLR, + /*DisallowCASC=*/ !isa(Dest.getValue())); + return; + } + } + } +#if 0 + // Disable call arg special coalescing for now, as it seems to break the FRC_MC example. + + if (isa(Dest.getValue()) + && SourceLR->Category == DestLR->Category) { + // This is an attempt at call arg coalescing. The two LRs interfere, but + // we can still try for "call arg special coalescing" (CASC). See the + // comment at the top of the file. + if (!DestLR->DisallowCASC) { + // CASC not disallowed. (It would have been disallowed if DestLR had + // already participated in normal coalescing other than CASC.) + // For any call site where SourceLR is not the corresponding call arg, + // check that A is not live. + auto ThisCallSite = cast(Cand->UseInDest->getUser()); + auto Callee = ThisCallSite->getCalledFunction(); + bool FailedCASC = false; + for (auto ui = Callee->use_begin(), ue = Callee->use_end(); + ui != ue; ++ui) { + auto CallSite = cast(ui->getUser()); + if (CallSite == ThisCallSite) + continue; + auto OtherArg = SimpleValue(CallSite->getArgOperand(cast( + Dest.getValue())->getArgNo()), Dest.getIndex()); + auto OtherLR = Liveness->getLiveRange(OtherArg); + // Check whether OtherArg is the same as SourceLR. This check covers + // several cases: + // 1. OtherArg == SourceLR: the other arg is already coalesced with + // our arg, so it would be OK to do CASC. + // 2. OtherArg is DestLR, meaning that the other call arg has already + // been coalesced with the func arg. We cannot do CASC if SourceLR + // and OtherArg interfere, which they do because we already know + // that DestLR interferes with SourceLR. + // 3. OtherArg is something else, meaning that some other value will + // be copied to the func arg here. We cannot do CASC if SourceLR + // and OtherArg interfere. + if (OtherLR == SourceLR) + continue; + if (Liveness->interfere(OtherLR, SourceLR)) { + FailedCASC = true; + break; + } + } + if (!FailedCASC) { + // Can coalesce. Do not disallow future CASC. + Liveness->coalesce(DestLR, SourceLR, /*DisallowCASC=*/ false); + return; + } + } + } +#endif + + // Coalescing failed. + LLVM_DEBUG( + if (SourceLR) { + dbgs() << "Live ranges \""; + DestLR->print(dbgs()); + dbgs() << "\" and \""; + SourceLR->print(dbgs()); + dbgs() << "\"" << (IsCopy ? 
" copy" : "") << " interfere, not coalescing\n"; + } else { + dbgs() << "Need copy of constant \""; + Source.print(dbgs()); + dbgs() << "\" to \""; + Dest.printName(dbgs()); + dbgs() << "\"\n"; + } + ); + if (isa(Dest.getValue())) + return; // Candidate is phi; copy insertion done later. + if (isa(Dest.getValue())) + return; // Call arg pre-copy, defer copy insertion + if (Liveness->isUnifiedRet(Dest.getValue())) + return; // Return value pre-copy, defer copy insertion + if (!Cand->UseInDest) + return; // Return value post-copy, defer copy insertion + if (isa(Dest.getValue()) || isa(Dest.getValue())) { + // A bitcast is normally copy coalesced, which means it cannot fail to + // coalesce. However, if the source is a phi node and the destination + // wraps round the loop and is used in another phi node in the same + // block that is later than the first phi node, then we instead + // try to normal coalesce, which fails because they interfere. + // This happens with a bitcast inserted in GenXLiveRanges to resolve + // an overlapping circular phi, but can happen in other cases too. + if ((int)genx::exactLog2( + Dest.getValue()->getType()->getPrimitiveSizeInBits()) <= 8) { + // This is a bitcast with a legal size for a single copy. We do not + // insert a copy, because GenXVisaFuncWriter will generate one. + // (GenXLegalization does not legalize a bitcast, so it can be + // illegal size here. We do that on the basis that a bitcast is + // normally copy coalesced.) + return; + } + // Otherwise, it is a bitcast of size more than 1 GRF or non-power-of-two, + // so we insert a copy. + } + // Insert the copy now for a two address op. Give it the number of the + // pre-copy slot, which is one less than the number of the two address + // instruction. + Instruction *DestInst = cast(Dest.getValue()); + showCoalesceFail(Dest, DestInst->getDebugLoc(), "two address", + DestLR, SourceLR); + Instruction *NewCopy = insertCopy(Source, DestLR, DestInst, "twoaddr", + Numbering->getNumber(DestInst) - 1); + NewCopy = insertIntoStruct(Dest.getValue()->getType(), + Dest.getIndex(), *Cand->UseInDest, NewCopy, DestInst); + // Replace the use of the old source. + *Cand->UseInDest = NewCopy; + // No need to extend the live range, as the result of the two address op was + // already marked as defined at the pre-copy slot. +} + +/*********************************************************************** + * processPhiNodes : add copies for uncoalesced phi node incomings + */ +void GenXCoalescing::processPhiNodes(FunctionGroup *FG) +{ + std::vector PhiCopies; + + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + // Scan the phi nodes at the start of this BB, if any. + PHINode *Phi = dyn_cast(&*bi); + if (!Phi) + break; + + // Collect copies to process + analysePhiCopies(Phi, PhiCopies); + } + } + } + + // Perform copy of uncoalesced phi node incomings. + // New phis can be created during this, store them. + std::vector NewPhis; + for (auto Elem : PhiCopies) { + processPhiCopy(Elem.Phi, Elem.IncomingIdx, NewPhis); + } + // Phi copies are resolved. Clean the list. + PhiCopies.clear(); + + // Process newly created phis. This loop is executed + // when coalescing failed to resolve issues with phis + // in branching join label blocks. 
Such situation is + // very rare because coalescing tries to solve it + // with the highest priority. + while (!NewPhis.empty()) { + // Collect phi copy candidates + for (auto *Phi : NewPhis) { + analysePhiCopies(Phi, PhiCopies); + } + // Phi copies are collected, clean current Phis worklist + NewPhis.clear(); + + // Perform copy of uncoalesced phi node incomings. + for (auto Elem : PhiCopies) { + processPhiCopy(Elem.Phi, Elem.IncomingIdx, NewPhis); + } + // Phi copies are resolved. Clean the list. + PhiCopies.clear(); + } +} + +/*********************************************************************** + * analysePhiCopies : for one phi node, collect copies for uncoalesced incomings + */ +void GenXCoalescing::analysePhiCopies(PHINode *Phi, + std::vector &ToProcess) { + // Scan each incoming to see if it was successfully coalesced. + LiveRange *DestLR = Liveness->getLiveRange(Phi); + if (DestLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + return; // Ignore phi node of EM/RM value. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + // Incoming should not be a constant (but could be undef) because + // GenXPostLegalization and GenXCategory called loadNonSimpleConstants + // to load the non-simple constant incomings, then GenXCategory also + // called GenXConstants::loadConstant for each remaining (simple) + // constant. + if (isa(Incoming)) + continue; // undef, no copy needed + assert(!isa(Incoming)); + if (Liveness->getLiveRange(Incoming) == DestLR) + continue; // coalesced, no copy needed + // A phi copy is needed + auto IncomingBlock = Phi->getIncomingBlock(i); + LLVM_DEBUG(dbgs() << "Need phi copy " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + ToProcess.push_back(PhiCopy(Phi, i)); + } +} + +/*********************************************************************** + * processPhiCopy : for one phi node incoming, add copy + */ +void GenXCoalescing::processPhiCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis) { + LiveRange *DestLR = Liveness->getLiveRange(Phi); + Value *Incoming = Phi->getIncomingValue(Inc); + auto *IncomingBlock = Phi->getIncomingBlock(Inc); + // Should be checked in analysePhiCopies + assert(DestLR->getCategory() < RegCategory::NUMREALCATEGORIES && + "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + // Check it again: something could change + if (Liveness->getLiveRange(Incoming) == DestLR) { + LLVM_DEBUG(dbgs() << "Already coalesced " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + return; + } + + LLVM_DEBUG(dbgs() << "Copying " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + + // Handle branching join label block separately + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) { + processPhiBranchingJoinLabelCopy(Phi, Inc, Phis); + return; + } + + DominatorTree *DomTree = getDomTree(IncomingBlock->getParent()); + Instruction *InsertPoint = IncomingBlock->getTerminator(); + InsertPoint = GotoJoin::getLegalInsertionPoint(InsertPoint, DomTree); + + // Give the copy the number allocated to the phi incoming + unsigned Num = Numbering->getPhiNumber(Phi, IncomingBlock); + + if (auto *I = dyn_cast(Incoming)) { + // This should not happen for good BBs (not join blocks) + // if DFG is correct. 
+ assert(DomTree->dominates(I->getParent(), InsertPoint->getParent()) && + "Dominance corrupted!"); + } + + showCoalesceFail(SimpleValue(Incoming), InsertPoint->getDebugLoc(), "phi", + DestLR, Liveness->getLiveRange(Incoming)); + Instruction *NewCopy = + insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num); + Phi->setIncomingValue(Inc, NewCopy); + // No need to extend the live range like we do in the two address op case + // in processCandidate(). The live range of a phi node already starts at + // each point where a copy might need to be inserted. +} + +/*********************************************************************** + * processPhiBranchingJoinLabelCopy : for one phi node incoming, add copy + * for branching join label incoming BB case + */ +void GenXCoalescing::processPhiBranchingJoinLabelCopy( + PHINode *Phi, unsigned Inc, std::vector &Phis) { + LiveRange *DestLR = Liveness->getLiveRange(Phi); + Value *Incoming = Phi->getIncomingValue(Inc); + auto *IncomingBlock = Phi->getIncomingBlock(Inc); + // Should be checked in analysePhiCopies + assert(DestLR->getCategory() < RegCategory::NUMREALCATEGORIES && + "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + // Should be checked in processPhiCopy + assert(Liveness->getLiveRange(Incoming) != DestLR && + "Should be checked earlier!"); + assert(GotoJoin::isBranchingJoinLabelBlock(IncomingBlock) && + "Should be checked earlier!"); + + LLVM_DEBUG(dbgs() << "Handling branching join label block case\n"); + + DominatorTree *DomTree = getDomTree(IncomingBlock->getParent()); + Instruction *InsertPoint = IncomingBlock->getTerminator(); + InsertPoint = GotoJoin::getLegalInsertionPoint(InsertPoint, DomTree); + + // Give the copy the number of term to make proper liverange + unsigned Num = Numbering->getNumber(InsertPoint); + + if (auto *PhiPred = dyn_cast(Incoming)) { + // In case when pred is Phi, it is possible to meet Phi in + // branching join blocks since such Phi does not brake + // SIMD CF Conformance. If such situation happens, we cannot + // perform copy of a phi value copy, we need to perform copy + // on all its incoming values. To do that, copy Phi and add + // it to Phis worklist. + // + // This situation is detected via corrupted dominance. + if (!DomTree->dominates(PhiPred->getParent(), InsertPoint->getParent())) { + auto *PhiCopy = copyNonCoalescedPhi(PhiPred, Phi); + assert(PhiCopy && "Invalid phi copy!"); + Phis.push_back(PhiCopy); + return; + } + } + + if (auto *I = dyn_cast(Incoming)) { + // This should not happen for good BBs (not join blocks) + // if DFG is correct. + // + // For join block, def must be somewhere before it + // because of SIMD CF Conformance. Case for Phi is + // described and handled above. 
+ assert(DomTree->dominates(I->getParent(), InsertPoint->getParent()) && + "Dominance corrupted!"); + } + + showCoalesceFail(SimpleValue(Incoming), InsertPoint->getDebugLoc(), "phi", + DestLR, Liveness->getLiveRange(Incoming)); + Instruction *NewCopy = + insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num); + Phi->setIncomingValue(Inc, NewCopy); + + // Extend liverange: we skipped some basic blocks + Liveness->rebuildLiveRange(DestLR); +} + +/*********************************************************************** + * copyNonCoalescedPhi : copy PhiPred and coalesce copy's LR with + * PhiSucc's LR + */ +PHINode *GenXCoalescing::copyNonCoalescedPhi(PHINode *PhiPred, + PHINode *PhiSucc) { + // Perform copy + auto *PhiCopy = cast(PhiPred->clone()); + PhiCopy->insertBefore(PhiPred->getNextNode()); + PhiCopy->setName(PhiPred->getName() + ".copy"); + Numbering->setNumber(PhiCopy, Numbering->getNumber(PhiPred)); + + // Handle LRs + Liveness->buildLiveRange(PhiCopy); + LiveRange *DestLR = Liveness->getLiveRange(PhiSucc); + LiveRange *NewLR = Liveness->getLiveRange(PhiCopy); + Liveness->coalesce(DestLR, NewLR, false); + + // Update incoming values + for (unsigned i = 0, e = PhiSucc->getNumIncomingValues(); i != e; ++i) { + Value *IncValue = PhiSucc->getIncomingValue(i); + if (IncValue == PhiPred) + PhiSucc->setIncomingValue(i, PhiCopy); + } + + return PhiCopy; +} + +/*********************************************************************** + * processCalls : insert copies where necessary for call args and ret values + * + * This scans all the calls, inserting copies where necessary for call arg + * pre-copies and return value pre- and post-copies. + * + * We need to do them in one go here because + * 1. a call arg or return value pre-copy coalescing candidate covers + * possibly multiple sites where the same LR input is used, without giving + * any way of getting back to them all; + * 2. we want the inserted copies to be in the order that live range + * computation assumed they would appear. + */ +void GenXCoalescing::processCalls(FunctionGroup *FG) +{ + // For each subroutine... + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + // For each call site... + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto *CI = dyn_cast(ui->getUser())) { + // For each func arg... + unsigned ArgIdx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai, ++ArgIdx) { + Argument *Arg = &*ai; + if (Arg->use_empty()) { + // Arg is unused inside the subroutine. Do not try and process + // further, as its live range probably does not have a category. + continue; + } + Value *CallArg = CI->getOperand(ArgIdx); + if (isa(CallArg)) { + // Call arg undefined. No coalescing needed. + continue; + } + // For each SimpleValue in the func arg... + for (unsigned StructIdx = 0, + se = IndexFlattener::getNumElements(Arg->getType()); + StructIdx != se; ++StructIdx) { + assert(!StructIdx && + "coalesce failure on struct call arg not tested"); + auto FuncArgSV = SimpleValue(Arg, StructIdx); + auto CallArgSV = SimpleValue(CallArg, StructIdx); + // See if they are coalesced. + auto DestLR = Liveness->getLiveRange(FuncArgSV); + auto SourceLR = Liveness->getLiveRange(CallArgSV); + if (!DestLR || DestLR == SourceLR || F == CI->getFunction()) + continue; + if (DestLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + continue; // Called function arg is EM. + // Need to insert a copy. Give it the number of the arg's pre-copy + // slot. 
+ showCoalesceFail(CallArgSV, CI->getDebugLoc(), "call arg", DestLR, + SourceLR); + unsigned Num = + Numbering->getArgPreCopyNumber(CI, ArgIdx, StructIdx); + Instruction *NewCopy = + insertCopy(CallArgSV, DestLR, CI, "callarg.precopy", Num); + NewCopy = insertIntoStruct(Arg->getType(), StructIdx, + CI->getOperand(ArgIdx), NewCopy, CI); + // Replace operand in call. + CI->setOperand(ArgIdx, NewCopy); + // No need to extend the live range like we do in the two address op + // case in processCandidate(). The live range of a func arg already + // starts at each point where a copy might need to be inserted. + } + } + // Now check the return value post-copy. + // + // The code to handle a coalesce failure in a return value post-copy + // is different to all other cases of coalesce failure, which are + // pre-copy. We need to ensure that the post-copied value is in the + // original live range for the original value (the return value), + // and all the original value's users are changed to use the post-copied + // value instead. The original value (the return value) gets moved out + // of its live range and put into that of the unified return value. + // + // If the return value is a struct, all the above happens for each + // struct element, with the extra complication of more new values to + // handle because of the extractvalue and insertvalue instructions we + // need to insert. + // + // First remember all uses of the return value, because we want to + // replace them after adding new ones below. Remember if they are + // all extractvalue with a non-struct result (which should usually be + // the case because GenXLowering removes most structs). + SmallVector CIUses; + bool AllUsesAreExtract = isa(CI->getType()); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (!EV || isa(EV->getType())) + AllUsesAreExtract = false; + CIUses.push_back(&*ui); + } + Instruction *InsertBefore = CI->getNextNode(); + Value *StructValue = CI; + SmallVector PreviousElements; + // For each SimpleValue in the return value... + for (unsigned StructIdx = 0, + se = IndexFlattener::getNumElements(CI->getType()); + StructIdx != se; ++StructIdx) { + auto UnifiedSV = SimpleValue(Liveness->getUnifiedRet(F), StructIdx); + auto SV = SimpleValue(CI, StructIdx); + // See if (the element in) the returned value is dead, or successfully + // coalesced with (the element in) the unified return value. + auto DestLR = Liveness->getLiveRangeOrNull(SV); + PreviousElements.push_back(DestLR); + if (!DestLR) + continue; // dead + auto SourceLR = Liveness->getLiveRange(UnifiedSV); + if (DestLR == SourceLR) + continue; // coalesced + assert(SourceLR); + if (SourceLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + continue; // Unified return value is EM, ignore. + // Remove (the element of) CI, the actual return value, from its + // own live range, and add it instead to the unified return value. + // insertCopy() will add the new value to DestLR (what + // was the LR for the element of CI). + Liveness->removeValueNoDelete(SV); + Liveness->setLiveRange(SV, SourceLR); + // Need to insert a copy. Give it the number of the post-copy slot. 
+ showCoalesceFail(SimpleValue(CI, StructIdx), CI->getDebugLoc(), + "ret postcopy", DestLR, SourceLR); + unsigned Num = Numbering->getRetPostCopyNumber(CI, StructIdx); + Instruction *NewCopy = + insertCopy(SimpleValue(CI, StructIdx), DestLR, InsertBefore, + "retval.postcopy", Num); + assert(NewCopy); + if (AllUsesAreExtract) { + // For a struct ret value where all the uses are non-struct + // extractvalue, replace uses of the extractvalues with NewCopy. + // Doing this, rather than calling insertIntoStruct() and letting + // the existing extractvalue extract it again, does not improve the + // code generated by the compiler (insertvalue/extractvalue do not + // generate any code), but it does make the IR simpler and easier + // to understand in a dump. + for (unsigned i = 0, e = CIUses.size(); i != e; ++i) { + if (!CIUses[i]) + continue; + auto EV = cast(CIUses[i]->getUser()); + if (StructIdx == + IndexFlattener::flatten(cast(CI->getType()), + EV->getIndices())) { + NewCopy->takeName(EV); + EV->replaceAllUsesWith(NewCopy); + if (EV == InsertBefore) + InsertBefore = InsertBefore->getNextNode(); + Liveness->removeValue(SimpleValue(EV)); + EV->eraseFromParent(); + CIUses[i] = 0; + } + } + } else { + // If this is a struct return value, we also need to insertvalue, + // creating a new struct value. + StructValue = insertIntoStruct(CI->getType(), StructIdx, + StructValue, NewCopy, InsertBefore); + // Also, for this and previously seen elements that are not dead, + // add that element of StructValue (the new insertvalue) to the live + // range. + if (StructValue != NewCopy) { + for (unsigned k = 0, ke = PreviousElements.size(); k != ke; ++k) { + if (PreviousElements[k]) + Liveness->setLiveRange(SimpleValue(StructValue, k), + PreviousElements[k]); + } + } + } + } + if (!AllUsesAreExtract) { + // Replace uses of the whole return value that existed before we added + // more uses above. + for (unsigned i = 0, e = CIUses.size(); i != e; ++i) + *CIUses[i] = StructValue; + } + } + } + if (F->getReturnType()->isVoidTy()) + continue; // no return value from this func + // For each return inst in the func... + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto RI = dyn_cast(fi->getTerminator()); + if (!RI) + continue; + Value *Input = RI->getOperand(0); + if (isa(Input)) + continue; + Value *UnifiedRet = Liveness->getUnifiedRet(F); + // For each struct element in the return value... + for (unsigned StructIdx = 0, + StructEnd = IndexFlattener::getNumElements(UnifiedRet->getType()); + StructIdx != StructEnd; ++StructIdx) { + auto DestLR = Liveness->getLiveRange(SimpleValue(UnifiedRet, StructIdx)); + auto SourceLR = Liveness->getLiveRange(SimpleValue(Input, StructIdx)); + if (DestLR == SourceLR) + continue; // coalesced + // Need to insert a copy. Give it the number of the ret pre-copy slot. + showCoalesceFail(SimpleValue(Input, StructIdx), RI->getDebugLoc(), + "ret precopy", DestLR, SourceLR); + unsigned Num = Numbering->getNumber(RI) - StructEnd + StructIdx; + Instruction *NewCopy = insertCopy(SimpleValue(Input, StructIdx), + DestLR, RI, "retval.precopy", Num); + NewCopy = insertIntoStruct(UnifiedRet->getType(), StructIdx, + RI->getOperand(0), NewCopy, RI); + // Replace operand in call. + RI->setOperand(0, NewCopy); + // No need to extend the live range like we do in the two address op + // case in processCandidate(). The live range of the unified return + // value already starts at each point where a copy might need to be + // inserted. 
+ } + } + } +} + +/*********************************************************************** + * processKernelArgs : add a copy for each kernel arg that is not aligned enough + */ +void GenXCoalescing::processKernelArgs(FunctionGroup *FG) +{ + auto F = FG->getHead(); + if (!isKernel(F)) + return; + Instruction *InsertBefore = F->front().getFirstNonPHIOrDbg(); + KernelMetadata KM(F); + unsigned Idx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) { + if (KM.shouldSkipArg(Idx++)) + continue; + auto Arg = &*ai; + auto LR = Liveness->getLiveRange(Arg); + if (!(LR->Offset & ((1U << LR->LogAlignment) - 1))) + continue; // aligned enough + // Insert a copy and give the original arg its own new live range. This + // leaves the original live range still live from the start of the + // function, and thus interfering with the new live range for the arg, + // but that doesn't matter. + SmallVector Uses; + for (auto ui = Arg->use_begin(), ue = Arg->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + unsigned Num = Numbering->getKernelArgCopyNumber(Arg); + auto Copy = insertCopy(Arg, LR, InsertBefore, "argcopy", Num); + Liveness->removeValueNoDelete(Arg); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = Copy; + auto NewLR = Liveness->getOrCreateLiveRange(Arg); + NewLR->setCategory(LR->getCategory()); + NewLR->push_back(Segment(Numbering->getNumber(F), Num)); + NewLR->Offset = LR->Offset; + LR->Offset = 0; + } +} + +void GenXCoalescing::coalesceOutputArgs(FunctionGroup *FG) { + auto F = FG->getHead(); + if (!isKernel(F)) + return; + + std::string Name = GenXIntrinsic::getGenXName(GenXIntrinsic::genx_output); + Function *OutputFn = F->getParent()->getFunction(Name); + if (!OutputFn) + return; + + KernelMetadata KM(F); + for (auto U : OutputFn->users()) { + auto CI = dyn_cast(U); + if (!CI || CI->getParent()->getParent() != F) + continue; + + unsigned Idx = 0; // kernel argument index + unsigned i = 0; // call argument index + for (auto I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { + if (!KM.isOutputArg(Idx++)) + continue; + + // This is the final value stored into the output argument. + // If this is coalesced into kernel argument, nothing to do. + // Otherwise, insert a copy. + Value *V = CI->getArgOperand(i); + Value *Arg = &*I; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(V); + LiveRange *LR2 = Liveness->getLiveRange(Arg); + + auto coalesceInput = [=]() { + // When LR1 is null, the input value should be Undef. Otherwise, it + // should be loaded as a constant. + if (LR1 == nullptr || LR1 == LR2) + return false; + + if (!Liveness->interfere(LR1, LR2)) { + Liveness->coalesce(LR1, LR2, false); + return false; + } + + // A copy is needed. + return true; + }; + + if (coalesceInput()) { + // Insert copy and add a short live range for copy-out. + unsigned Num = Numbering->getNumber(CI); + auto Copy = insertCopy(V, LR2, CI, "copyout", Num); + CI->setArgOperand(i, Copy); + LR2->push_back(Num, Num + 1); + LR2->sortAndMerge(); + } + ++i; + } + } +} + +void GenXCoalescing::coalesceCallables() { + for (auto CI : Callables) { + auto NI = CI->getNextNode(); + // if the next instruction is a CM-output intrinsic, + // we don't really need that cm-output because CMCallable can serve as + // the anchor for preventing DCE + if (NI && isa(NI)) { + CallInst *OC = cast(NI); + if (GenXIntrinsic::getGenXIntrinsicID(OC) == GenXIntrinsic::genx_output) { + OC->eraseFromParent(); + } + } + + auto Nxt = CI->getNextNode(); + auto Ret = Nxt; + + // 1. 
Possible next node is branch to return + auto Br = dyn_cast(Nxt); + if (Br && Br->isUnconditional()) + Ret = &Br->getSuccessor(0)->front(); + + // 2. Possible next node is GenXIntrinsic::genx_output + if (GenXIntrinsic::getGenXIntrinsicID(Ret) == GenXIntrinsic::genx_output) + Ret = Ret->getNextNode(); + + // Check if next node is correct return insn + if (!Ret || !isa(Ret)) { + // getRetVal could not determine what happens to this return value. + DiagnosticInfoFastComposition Err(CI, + "Callable Call must be right before function return", + (ST->warnCallable() ? DS_Warning : DS_Error)); + CI->getContext().diagnose(Err); + } + Function *F = CI->getParent()->getParent(); + assert(isKernel(F)); + KernelMetadata KM(F); + unsigned Idx = 0; // kernel argument index + unsigned i = 0; // call argument index + for (auto I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { + if (KM.getArgInputOutputKind(Idx++) == KernelMetadata::IO_Normal) + continue; + + // This is the final value stored into the output argument. + // If this is coalesced into kernel argument, nothing to do. + // Otherwise, insert a copy. + Value *V = CI->getArgOperand(i); + Value *Arg = &*I; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(V); + LiveRange *LR2 = Liveness->getLiveRange(Arg); + + auto coalesceInput = [=]() { + // When LR1 is null, the input value should be Undef. Otherwise, it + // should be loaded as a constant. + if (LR1 == nullptr || LR1 == LR2) + return false; + + if (!Liveness->interfere(LR1, LR2)) { + Liveness->coalesce(LR1, LR2, false); + return false; + } + + // A copy is needed. + return true; + }; + + if (coalesceInput()) { + // Insert copy and add a short live range for copy-out. + unsigned Num = Numbering->getNumber(CI); + auto Copy = insertCopy(V, LR2, CI, "copyout", Num); + CI->setArgOperand(i, Copy); + LR2->push_back(Num, Num + 1); + LR2->sortAndMerge(); + } + ++i; + } + } +} + +void GenXCoalescing::coalesceGlobalLoads(FunctionGroup *FG) { + for (auto &GV : FG->getModule()->globals()) { + if (!GV.hasAttribute(genx::FunctionMD::GenXVolatile)) + continue; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(&GV); + if (!LR1) + continue; + + // Collect all loads. + std::set LoadsInGroup; + for (auto UI : GV.users()) { + if (auto LI = dyn_cast(UI)) { + assert(LI->getPointerOperand() == &GV); + auto Fn = LI->getParent()->getParent(); + // Check this load is inside the group. + if (std::find(FG->begin(), FG->end(), Fn) != FG->end()) + LoadsInGroup.insert(LI); + } + // Global variable is used in a constexpr. + if (&GV != getUnderlyingGlobalVariable(UI)) + continue; + for (auto U : UI->users()) + if (auto LI = dyn_cast(U)) { + auto Fn = LI->getParent()->getParent(); + // Check this load is inside the group. + if (std::find(FG->begin(), FG->end(), Fn) != FG->end()) + LoadsInGroup.insert(LI); + } + } + + // Do coalescing. + for (auto LI : LoadsInGroup) { + LiveRange *LR2 = Liveness->getLiveRange(LI); + LR1 = Liveness->coalesce(LR1, LR2, false); + } + } +} + +/*********************************************************************** + * insertCopy : insert a copy of a non-struct value + * + * Enter: Input = value to copy + * LR = live range to add the new value to + * InsertBefore = insert copy before this inst + * Name = name to give the new value + * Number = number to give the new instruction(s) + * + * Return: The new copy instruction + * + * This inserts multiple copies if the input value is a vector that is + * bigger than two GRFs or a non power of two size. 
+ */
+Instruction *GenXCoalescing::insertCopy(SimpleValue Input, LiveRange *LR,
+    Instruction *InsertBefore, StringRef Name, unsigned Number)
+{
+  assert(!isa<Constant>(Input.getValue()));
+  if (auto ST = dyn_cast<StructType>(Input.getValue()->getType())) {
+    // Input is a struct element. First extract it. This
+    // extract is created coalesced by adding it to the live
+    // range of the struct element. An extractvalue is always
+    // coalesced and never generates code.
+    SmallVector<unsigned, 4> Indices;
+    IndexFlattener::unflatten(ST, Input.getIndex(), &Indices);
+    Instruction *Extract = ExtractValueInst::Create(Input.getValue(), Indices,
+        "twoaddr.extract", InsertBefore);
+    auto SourceLR = Liveness->getLiveRange(Input);
+    assert(SourceLR);
+    Liveness->setLiveRange(SimpleValue(Extract), SourceLR);
+    Input = SimpleValue(Extract);
+  }
+  return Liveness->insertCopy(Input.getValue(), LR, InsertBefore, Name, Number);
+}
+
+/***********************************************************************
+ * insertIntoStruct : create an insertvalue to insert a new value into a
+ *                    struct
+ *
+ * Enter:   Ty = type of putative struct
+ *          FlattenedIndex = flattened index within the struct
+ *          OldStruct = old value of struct
+ *          NewVal = new value to insert into it
+ *          InsertBefore = where to insert new instruction before
+ *
+ * Return:  the new InsertValueInst
+ *
+ * If Ty is not a struct type, this just returns NewVal.
+ */
+Instruction *GenXCoalescing::insertIntoStruct(Type *Ty,
+    unsigned FlattenedIndex, Value *OldStruct, Instruction *NewVal,
+    Instruction *InsertBefore)
+{
+  auto ST = dyn_cast<StructType>(Ty);
+  if (!ST)
+    return NewVal;
+  // We're copying into struct element. We need to add an insertvalue.
+  SmallVector<unsigned, 4> Indices;
+  IndexFlattener::unflatten(ST, FlattenedIndex, &Indices);
+  return InsertValueInst::Create(OldStruct, NewVal,
+      Indices, "coalescefail.insert", InsertBefore);
+}
+
+/***********************************************************************
+ * showCoalesceFail : output a message to say that coalescing has failed
+ */
+void GenXCoalescing::showCoalesceFail(SimpleValue V, const DebugLoc &DL,
+                                      const char *Intro, LiveRange *DestLR,
+                                      LiveRange *SourceLR) {
+  if (isa<UndefValue>(V.getValue()))
+    return;
+  if (V.getType()->getPrimitiveSizeInBits() >=
+      GenXShowCoalesceFailThreshold * 8U) {
+    dbgs() << "GenX " << Intro << " coalesce failed on ";
+    V.printName(dbgs());
+    dbgs() << " size " << V.getType()->getPrimitiveSizeInBits() / 8U
+           << " bytes at ";
+    DL.print(dbgs());
+    dbgs() << "\nDestLR: " << *DestLR << "\nSourceLR: " << *SourceLR << "\n";
+  }
+}
+
+/***********************************************************************
+* DiagnosticInfoFastComposition initializer from Instruction
+*
+* If the Instruction has a DebugLoc, then that is used for the error
+* location.
+* Otherwise, the location is unknown.
+*/
+DiagnosticInfoFastComposition::DiagnosticInfoFastComposition(Instruction *Inst,
+    const Twine &Desc, DiagnosticSeverity Severity)
+    : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0)
+{
+  auto DL = Inst->getDebugLoc();
+  // Only use the location if the instruction actually has a DebugLoc.
+  if (DL) {
+    Filename = DL->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  Description = (Twine("Fast Composition restriction violation") +
+      ": " + Desc).str();
+}
+
+/***********************************************************************
+* DiagnosticInfoFastComposition::print : print the error/warning message
+*/
+void DiagnosticInfoFastComposition::print(DiagnosticPrinter &DP) const
+{
+  std::string Loc(
+      (Twine(!Filename.empty() ? Filename : "") +
+       ":" + Twine(Line) +
+       (!Col ?
Twine() : Twine(":") + Twine(Col)) + + ": ") + .str()); + DP << Loc << Description; +} + + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp new file mode 100644 index 000000000000..389bdd75eb75 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp @@ -0,0 +1,1524 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXConstants +/// ------------- +/// +/// GenXConstants is not in itself a pass. It contains utility functions and a +/// class used by other passes for constant loading. +/// +/// loadNonSimpleConstants +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The GenXPostLegalization pass calls loadNonSimpleConstants to insert a load +/// for any operand that is a non-simple constant. (A non-simple constant is one +/// that is too big or an invalid value for a constant operand.) +/// +/// It is called in two places: +/// +/// 1. in the GenXPostLegalization pass, run after legalization but +/// before CSE, so CSE has an opportunity to common up loaded non-simple +/// constants; +/// 2. later on in GenXCategory, to mop up non-simple constant operands +/// created by CSE's constant propagation. +/// +/// This does not insert a load if the constant is "big simple" (that is, it is +/// illegally wide but each legalized part of it is simple) and it is used in +/// the "old value" operand of a wrregion, or as a call arg. Inserting a load +/// of such a constant here would allow the load to be CSEd, which would be +/// counter productive as some of the uses would not be kill uses and so +/// coalescing would fail there. +/// +/// Phi incoming constants are not loaded here; they are loaded in +/// loadPhiConstants called from GenXCategory. Phi constant loads do not need to +/// participate in CSE as loadPhiConstants has its own commoning up tailored for +/// phi nodes. +/// +/// loadConstants +/// ^^^^^^^^^^^^^ +/// +/// This is called from GenXCategory. It inserts a load for each constant +/// operand that is not allowed to be constant, but remains after +/// loadNonSimpleConstants. +/// +/// Phi incoming constants are not loaded here; they are loaded in +/// loadPhiConstants called from GenXCategory. 
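+///
+/// As a rough illustration (the wrapper function here is hypothetical; the
+/// real call sites are the GenXPostLegalization and GenXCategory passes),
+/// both helpers are applied per instruction, and both skip phi nodes
+/// themselves:
+///
+///   bool runConstantLoading(Function &F, const GenXSubtarget *ST) {
+///     bool Modified = false;
+///     for (BasicBlock &BB : F)
+///       for (Instruction &I : BB) {
+///         Modified |= genx::loadNonSimpleConstants(&I, nullptr, ST);
+///         Modified |= genx::loadConstants(&I, ST);
+///       }
+///     return Modified;
+///   }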
+/// +/// loadPhiConstants +/// ^^^^^^^^^^^^^^^^ +/// +/// This is called from GenXCategory, and it inserts loads for constant phi +/// incomings, commoning up when possible and sensible. +/// +/// Commoning up (inserting one load for multiple phi incomings with the same +/// constant, across one or more phi nodes) proceeds as follows: +/// +/// Firstly, we divide the phi nodes into _webs_, where each web is the maximal +/// set of phi nodes that are related through phi nodes and two address +/// instructions, so will be coalesced later on in the flow. +/// +/// Secondly, for a single web, we look for multiple uses of the same constant. +/// Such a constant has a load instruction inserted just once, at the end of the +/// nearest common dominator of all the corresponding incoming blocks. +/// +/// If that insert point is in an empty split critical edge block, we instead +/// insert in the block above that, in the hope that the split critical edge +/// block can be removed later. +/// +/// ConstantLoader +/// ^^^^^^^^^^^^^^ +/// +/// ConstantLoader is a class that represents a constant and information on how +/// to load it. This is where analysis happens of whether it is a legal packed +/// vector, or whether it needs multiple instructions to load it. It then has +/// methods to insert the code to load the constant. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CONSTANTS" + +#include "GenXConstants.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * loadConstantStruct : insert instructions to load a constant struct + */ +static Value *loadConstantStruct(Constant *C, Instruction *InsertBefore, + const GenXSubtarget *Subtarget) { + auto ST = cast(C->getType()); + Value *Agg = UndefValue::get(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Constant *El = C->getAggregateElement(i); + if (isa(El)) + continue; + Value *LoadedEl = nullptr; + if (isa(El->getType())) + LoadedEl = loadConstantStruct(El, InsertBefore, Subtarget); + else + LoadedEl = ConstantLoader(El, Subtarget).load(InsertBefore); + Agg = InsertValueInst::Create(Agg, LoadedEl, i, "loadstruct", InsertBefore); + } + return Agg; +} + +/*********************************************************************** + * loadNonSimpleConstants : for any non-simple or illegal size constant in + * an instruction, load it. + * + * Enter: Inst = instruction to find constant operands in + * AddedInstructions = 0 else vector to push added instructions onto + * + * Return: whether code was modified + * + * This does not load constants in a phi nodes. That is done in + * loadPhiConstants. + */ +bool genx::loadNonSimpleConstants(Instruction *Inst, + SmallVectorImpl *AddedInstructions, + const GenXSubtarget* Subtarget) +{ + bool Modified = false; + if (isa(Inst)) + return Modified; + // Omit call target operand of a call. 
+ unsigned NumArgs = Inst->getNumOperands(); + auto CI = dyn_cast(Inst); + if (CI) + NumArgs = CI->getNumArgOperands(); + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + for (unsigned i = 0; i != NumArgs; ++i) { + if (isa(Inst->getOperand(i))) { + Use *U = &Inst->getOperandUse(i); + Constant *C = dyn_cast(*U); + if (!C) + continue; + if (isa(C)) + continue; + if (isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + ConstantLoader CL(C, Inst, AddedInstructions, Subtarget); + if (CL.needFixingSimple()) { + CL.fixSimple(i); + continue; + } + if (CL.isSimple()) + continue; + // Do not load a "big simple" constant for the "old value of vector" + // input of a wrregion, so it does not get CSEd. CSEing it is + // counter-productive because, if it has multiple uses, it will + // need to be two-address copied by GenXCoalescing anyway. + if (GenXIntrinsic::isWrRegion(IID) + && i == GenXIntrinsic::GenXRegion::OldValueOperandNum + && CL.isBigSimple()) + continue; + // Similarly, do not load a "big simple" constant for a call arg. + if (CI && IID == GenXIntrinsic::not_any_intrinsic && CL.isBigSimple()) + continue; + *U = CL.loadBig(Inst); + Modified = true; + } + } + return Modified; +} + +bool genx::loadConstantsForInlineAsm( + CallInst *CI, SmallVectorImpl *AddedInstructions, + const GenXSubtarget *Subtarget) { + assert(CI->isInlineAsm() && "Inline asm expected"); + bool Modified = false; + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + Use *U; + for (unsigned i = 0, e = ConstraintsInfo.size(), ArgNo = 0; i != e; ++i) { + auto &Info = ConstraintsInfo[i]; + if (Info.isOutput()) + continue; + U = &CI->getOperandUse(ArgNo); + ArgNo++; + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + switch (Info.getConstraintType()) { + default: + *U = ConstantLoader(C, nullptr, AddedInstructions, Subtarget).load(CI); + Modified = true; + break; + case ConstraintType::Constraint_n: + case ConstraintType::Constraint_i: + case ConstraintType::Constraint_F: + break; + } + } + } + } + return Modified; +} + + + +/*********************************************************************** + * loadConstants : load constants as required for an instruction + * + * This handles operands that are not allowed to be constant. A constant + * operand that needs loading because it is a non-simple constant is + * handled in loadNonSimpleConstants. + * + * This does not load constants in a phi nodes. That is done in + * loadPhiConstants. + */ +bool genx::loadConstants(Instruction *Inst, + const GenXSubtarget* Subtarget) +{ + bool Modified = false; + Use *U; + if (isa(Inst)) + return Modified; + if (isa(Inst) && + Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Predicate binary operator: disallow constant operands, except + // that xor with -1 is allowed. 
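+    // For illustration: a predicate "not" is written as "xor %p, <all ones>",
+    // so that constant operand is left in place; any other constant operand
+    // of an i1 and/or/xor is first materialized with ConstantLoader::load
+    // below.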
+ for (unsigned oi = 0; oi != 2; ++oi) + if (auto C = dyn_cast(Inst->getOperand(oi))) { + auto IsNot = [=]() { + if (oi != 1) + return false; + if (Inst->getOpcode() != Instruction::Xor) + return false; + if (!C->getType()->isVectorTy()) + return C->isAllOnesValue(); + Constant *C1 = C->getSplatValue(); + return C1 && C1->isAllOnesValue(); + }; + if (!IsNot()) { + Inst->setOperand(oi, ConstantLoader(C, Subtarget).load(Inst)); + Modified = true; + } + } + } + if (isa(Inst)) { + // select: disallow constant selector + U = &Inst->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(Inst); + Modified = true; + } + return Modified; + } + if (isa(Inst)) { + // insertvalue (inserting a value into a struct): disallow constant + // on element operand. + U = &Inst->getOperandUse(1); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(Inst); + Modified = true; + } + // Also disallow constant (other than undef) on old struct value operand. + // We need to load each non-undef element separately. + U = &Inst->getOperandUse(0); + if (auto C = dyn_cast(*U)) + if (!isa(C)) + *U = loadConstantStruct(C, Inst, Subtarget); + return Modified; + } + if (auto Br = dyn_cast(Inst)) { + // Conditional branch: disallow constant condition. + if (Br->isConditional()) { + if (auto C = dyn_cast(Br->getCondition())) { + Br->setCondition(ConstantLoader(C, Subtarget).load(Br)); + Modified = true; + } + } + return Modified; + } + if (auto Ret = dyn_cast(Inst)) { + // Return: disallow constant return value in a subroutine (internal + // linkage). + if (Ret->getNumOperands() && Ret->getParent()->getParent()->getLinkage() + == GlobalValue::InternalLinkage) { + if (auto C = dyn_cast(Ret->getOperand(0))) { + if (!C->getType()->isVoidTy()) + Ret->setOperand(0, ConstantLoader(C, Subtarget).load(Ret)); + } + } + return Modified; + } + auto CI = dyn_cast(Inst); + if (!CI) + return Modified; + if (CI->isInlineAsm()) + return loadConstantsForInlineAsm(CI, nullptr, Subtarget); + int IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI); + switch (IntrinsicID) { + case GenXIntrinsic::not_any_intrinsic: + case Intrinsic::fma: + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_sumad: + case GenXIntrinsic::genx_usmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_output: + // load all args for subroutine and some intrinsic calls. + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + U = &CI->getOperandUse(i); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + } + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + break; + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_absf: + // abs modifier: disallow constant input. 
+ U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + break; + case GenXIntrinsic::genx_rdpredregion: + case GenXIntrinsic::genx_any: + case GenXIntrinsic::genx_all: + // rdpredregion, any, all: disallow constant input + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // rdregion: disallow constant input + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + // Also disallow constant vector index (constant scalar OK). + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + if (auto C = dyn_cast(*U)) { + if (isa(C->getType())) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_wrpredpredregion: + // wrpredpred: disallow constant "old vector" input unless undef + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + // wrregion: disallow constant "old vector" input unless undef + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + // Also disallow constant vector index (constant scalar OK). + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + if (auto C = dyn_cast(*U)) { + if (isa(C->getType())) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + // Also disallow constant predicate unless all ones. + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(*U)) { + if (!C->isAllOnesValue()) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_simdcf_goto: + // goto: disallow constant predicate input, unless it is all 0. We want to + // allow constant all 0, as it is the encoding used for an "else", and + // loading the constant into a predicate register stops the finalizer's + // structurizer working. + U = &CI->getOperandUse(2); + if (auto C = dyn_cast(*U)) { + if (!C->isNullValue()) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + default: + // Intrinsic: check intrinsic descriptor to see where constant args + // are allowed. + // Iterate through each field in the intrinsic info. + GenXIntrinsicInfo II(IntrinsicID); + // Intrinsic not found. + if (II.isNull()) + return Modified; + unsigned MaxRawOperands = II.getTrailingNullZoneStart(CI); + for (GenXIntrinsicInfo::iterator i = II.begin(), e = II.end(); i != e; ++i) { + GenXIntrinsicInfo::ArgInfo AI = *i; + if (!AI.isArgOrRet() || AI.isRet()) + continue; + // This field relates to an operand. + U = &CI->getOperandUse(AI.getArgIdx()); + auto C = dyn_cast(*U); + if (!C) + continue; + // Operand is constant. + // Allow constant if it is i1 or vector of i1 set to all ones; this + // represents an "all true" predication field. + if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue()) + continue; + // Allow constant if intrinsic descriptor allows it for this arg. 
+ if (!AI.isImmediateDisallowed()) + continue; + // If it is a RAW operand, allow the constant if it's in the trailing + // null region (it must be a null constant if so), or if the value + // is undefined and RAW_NULLALLOWED is enabled. + if (AI.isRaw()) { + if ((unsigned)AI.getArgIdx() >= MaxRawOperands) { + assert(C->isNullValue()); + continue; + } + if (isa(C) && AI.rawNullAllowed()) + continue; + } + // Also allow constant if it is undef in a TWOADDR + if (isa(C) && AI.getCategory() == GenXIntrinsicInfo::TWOADDR) + continue; + // Also allow constant if it is a reserved surface index. + if (AI.getCategory() == GenXIntrinsicInfo::SURFACE && + visa::isReservedSurfaceIndex(visa::convertToSurfaceIndex(C))) { + continue; + } + // Operand is not allowed to be constant. Insert code to load it. + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + break; + } + return Modified; +} + +/*********************************************************************** + * loadPhiConstants : load constant incomings in phi nodes, commoning up + * if appropriate + */ +bool genx::loadPhiConstants(Function *F, DominatorTree *DT, + bool ExcludePredicate, const GenXSubtarget* Subtarget) { + bool Modified = false; + std::set Done; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(); ; ++bi) { + auto Phi = dyn_cast(&*bi); + if (!Phi) + break; + if (!Done.insert(Phi).second) + continue; // phi node already processed in some web + // Gather the web of phi nodes and two address instructions related to + // this one. This is an approximation to the web of instructions that + // will or could be coalesced. + // (Use Web as a worklist of phi nodes and two address instructions to + // use to find other phi nodes and two address instructions.) + // + // We process a web of related phi nodes at a time, rather than all phi + // nodes that use the constant, to avoid this situation: + // we try and common up two phi nodes in the same basic block (e.g. two + // variables both initialized to 0 before a loop), but end up having to + // insert a copy for one of them anyway in coalescing. + SmallVector Web; + Web.push_back(Phi); + for (unsigned wi = 0; wi != Web.size(); ++wi) { + auto Inst = Web[wi]; + unsigned oi = 0, oe = 0; + if ((Phi = dyn_cast(Inst))) { + // Phi node: process each incoming. + oe = Phi->getNumIncomingValues(); + } else { + // Two address instruction: process just the two address operand. + oi = getTwoAddressOperandNum(cast(Inst)); + oe = oi + 1; + } + + auto IsPhiOrTwoAddress = [=](Value *V) { + if (isa(V)) + return true; + if (auto CI = dyn_cast(V)) + return getTwoAddressOperandNum(CI) >= 0; + return false; + }; + + // For each incoming: + for (; oi != oe; ++oi ) { + auto Incoming = Inst->getOperand(oi); + // If it is a phi node or two address instruction, push it into the + // web for processing later. + if (IsPhiOrTwoAddress(Incoming)) { + auto IncomingInst = cast(Incoming); + if (Done.insert(IncomingInst).second) + Web.push_back(IncomingInst); + } else if (!isa(Incoming)) { + // For any other inst or arg, see if it has any other use in a phi + // node or two address inst, and push that into the web. + for (auto ui = Incoming->use_begin(), ue = Incoming->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (IsPhiOrTwoAddress(User)) + if (Done.insert(User).second) + Web.push_back(User); + } + } + } + // Now process each use of the result of the phi node or two address + // instruction. 
If the use is in a phi node or is a two address operand, + // push the user into the web. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (IsPhiOrTwoAddress(User)) + if (Done.insert(User).second) + Web.push_back(User); + } + } + LLVM_DEBUG( + dbgs() << "loadPhiConstants: Web of phi nodes and two address insts:\n"; + for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) + dbgs() << **wi << "\n" + ); + // Now process the web, ignoring anything other than phi nodes. + // Gather the distinct constants, and every use for each one in a phi + // node. + std::map> ConstantUses; + SmallVector DistinctConstants; + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + auto Phi = dyn_cast(Web[wi]); + if (!Phi) + continue; + for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) { + Use *U = &Phi->getOperandUse(oi); + auto *C = dyn_cast(*U); + if (!C || isa(C)) + continue; + // when doing this transform in pattern matching phase + if (ExcludePredicate) { + if (C->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (C->getType()->getPrimitiveSizeInBits() <= 256) + continue; + auto IncomingBlock = Phi->getIncomingBlock(oi); + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) + continue; + } + + auto Entry = &ConstantUses[C]; + if (!Entry->size()) + DistinctConstants.push_back(C); + Entry->push_back(U); + } + } + // Handle each distinct constant. + for (unsigned dci = 0, dce = DistinctConstants.size(); dci != dce; ++dci) { + Constant *C = DistinctConstants[dci]; + auto Entry = &ConstantUses[C]; + if (Entry->size() != 1) { + LLVM_DEBUG( + dbgs() << "multiple use of " << *C << "\n"; + for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) + dbgs() << *(*Entry)[ei]->getUser() << "\n" + ); + } + // Find the closest common dominator of the incoming blocks of all phi + // uses of the constant. That is where we want to insert the constant + // load. + Use *U = (*Entry)[0]; + auto InsertBB = cast(U->getUser()) + ->getIncomingBlock(U->getOperandNo()); + for (unsigned ei = 1, ee = Entry->size(); ei != ee; ++ei) { + U = (*Entry)[ei]; + auto Phi = cast(U->getUser()); + auto IncomingBB = Phi->getIncomingBlock(U->getOperandNo()); + InsertBB = DT->findNearestCommonDominator(InsertBB, IncomingBB); + } + // If that location is an empty split critical edge block, go up to its + // predecessor (which is also its immediate dominator) if this block is + // "true" successor of branching simd cf block. In this case we cannot + // insert anything in current block and have to create partial + // redundancy. + assert(InsertBB); + auto *InsertTerm = InsertBB->getTerminator(); + auto *SinglePred = InsertBB->getSinglePredecessor(); + if (InsertTerm->getNumSuccessors() == 1 && + InsertTerm == &InsertBB->front() && SinglePred && + GotoJoin::isBranchingGotoJoinBlock(SinglePred)) + InsertBB = SinglePred; + + // Insert the constant load. + ConstantLoader CL(C, Subtarget); + Value *Load = nullptr; + Instruction *InsertBefore = InsertBB->getTerminator(); + if (!CL.isSimple()) + Load = CL.loadNonSimple(InsertBefore); + else + Load = CL.load(InsertBefore); + Modified = true; + // Modify the uses. 
+ for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) + *(*Entry)[ei] = Load; + // replace other non-phi uses that are also dominated by the InsertBB + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + if (isa(Web[wi])) + continue; + auto CI = dyn_cast(Web[wi]); + if (CI && getTwoAddressOperandNum(CI) >= 0) { + auto oi = getTwoAddressOperandNum(CI); + Use *U = &CI->getOperandUse(oi); + auto *UC = dyn_cast(*U); + if (UC && UC == C) { + if (CI->getParent() != InsertBB && DT->dominates(InsertBB, CI->getParent())) + *U = Load; + } + } + } + } + } + } + return Modified; +} + +void ConstantLoader::fixSimple(int OperandIdx) { + assert(NewC && + "no need to fix simple case"); + assert(User->getOperand(OperandIdx) == C && + "wrong arguments: wrong operand index was provided"); + User->setOperand(OperandIdx, NewC); + C = NewC; + // indicate that we no longer need fix + NewC = nullptr; +} + +/*********************************************************************** + * ConstantLoader::loadNonSimple : load a non-simple constant + * + * Enter: C = constant to lower if necessary + * Inst = instruction it is used in (also used to insert new + * code before) + * + * Return: new instruction + */ +Instruction *ConstantLoader::loadNonSimple(Instruction *Inst) +{ + assert(!isSimple()); + if (!isLegalSize()) + return loadBig(Inst); + if (PackedFloat) { + unsigned NumElts = C->getType()->getVectorNumElements(); + SmallVector Quads; + for (unsigned i = 0, e = NumElts; i != e; i += 4) { + SmallVector Quad; + for (unsigned j = 0; j != 4 && (i + j) < NumElts; ++j) + Quad.push_back(C->getAggregateElement(i + j)); + ConstantLoader Packed(ConstantVector::get(Quad)); + Quads.push_back(Packed.load(Inst)); + } + Value *V = UndefValue::get(C->getType()); + unsigned Offset = 0; + auto DL = Inst->getDebugLoc(); + for (auto &Q : Quads) { + VectorType *VTy = cast(Q->getType()); + Region R(V); + R.getSubregion(Offset, VTy->getNumElements()); + V = R.createWrRegion(V, Q, "constant.quad" + Twine(Offset), Inst, DL); + Offset += VTy->getNumElements(); + } + return cast(V); + } + if (PackedIntScale) { + auto PackTy = C->getType()->getScalarType(); + // limit the constant-type to 32-bit because we do not want 64-bit operation + if (PackTy->getPrimitiveSizeInBits() > 32) + PackTy = Type::getInt32Ty(Inst->getContext()); + // Load as a packed int vector with scale and/or adjust. 
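+    // Worked example (illustrative numbers only): a constant <10, 20, 30, 40>
+    // with PackedIntAdjust = 10 and PackedIntScale = 10 is loaded as the
+    // packed vector <0, 1, 2, 3>; the multiply and add emitted further down
+    // then rebuild the original values as 0..3 * 10 + 10. Every packed
+    // element must land in the [-8, 15] range checked by the assert below.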
+ SmallVector PackedVals; + for (unsigned i = 0, e = C->getType()->getVectorNumElements(); + i != e; ++i) { + int64_t Val = 0; + if (auto CI = dyn_cast(C->getAggregateElement(i))) { + Val = CI->getSExtValue(); + Val -= PackedIntAdjust; + Val /= PackedIntScale; + } + PackedVals.push_back(ConstantInt::get(PackTy, Val, /*isSigned=*/true)); + assert(cast(PackedVals.back())->getSExtValue() >= -8 + && cast(PackedVals.back())->getSExtValue() <= 15); + } + ConstantLoader Packed(ConstantVector::get(PackedVals)); + auto LoadPacked = Packed.load(Inst); + if (PackedIntScale != 1) + LoadPacked = BinaryOperator::Create(Instruction::Mul, LoadPacked, + ConstantVector::getSplat(C->getType()->getVectorNumElements(), + ConstantInt::get(PackTy, PackedIntScale, + /*isSigned=*/true)), "constantscale", Inst); + if (PackedIntAdjust) + LoadPacked = BinaryOperator::Create(Instruction::Add, LoadPacked, + ConstantVector::getSplat(C->getType()->getVectorNumElements(), + ConstantInt::get(PackTy, PackedIntAdjust, + /*isSigned=*/true)), "constantadjust", Inst); + if (PackTy->getPrimitiveSizeInBits() < + C->getType()->getScalarType()->getPrimitiveSizeInBits()) { + LoadPacked = CastInst::CreateSExtOrBitCast( + LoadPacked, C->getType(), "constantzext", Inst); + } + return LoadPacked; + } + if (auto CC = getConsolidatedConstant(C)) { + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + ConstantLoader CCL(CC, Subtarget); + Instruction *NewInst = nullptr; + if (CCL.isSimple()) + NewInst = CCL.load(Inst); + else + NewInst = CCL.loadNonSimple(Inst); + NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(), + "constant", Inst); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; + } + VectorType *VT = dyn_cast(C->getType()); + unsigned NumElements = VT->getNumElements(); + SmallVector Elements; + unsigned UndefBits = 0; + if (ConstantDataVector *CDV = dyn_cast(C)) { + // Gather the elements. + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = CDV->getElementAsConstant(i); + assert(!isa(El) && "CDV element can't be undef"); + Elements.push_back(El); + } + } else { + ConstantVector *CV = cast(C); + // Gather the elements. + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = CV->getOperand(i); + if (isa(El)) + UndefBits |= 1 << i; + Elements.push_back(El); + } + } + unsigned RemainingBits = ~UndefBits + & ((NumElements == 32 ? 0 : 1 << NumElements) - 1); + if (!RemainingBits) { + // All elements are undef. This should have been simplified away earlier, + // but we need to cope with it in case it was not. Just load the first + // element. + RemainingBits = 1; + } + Instruction *Result = 0; + // If it is wider than 8 elements, see if we can load any group of 8 as a + // packed vector. + if (NumElements > 8) { + for (unsigned Idx = 0; Idx < NumElements - 4; Idx += 8) { + unsigned Size = std::min(8U, NumElements - Idx); + Constant *SubC = getConstantSubvector(C, Idx, Size); + if (isa(SubC)) + continue; + ConstantLoader SubLoader(SubC, Subtarget); + if (SubLoader.PackedIntScale == 0 && !SubLoader.isPackedFloatVector()) + continue; + Region R(C); + R.getSubregion(Idx, Size); + if (SubLoader.isSimple()) { + Value *SubV = SubC; + Result = cast(R.createWrConstRegion( + Result ? 
(Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + Inst, Inst->getDebugLoc())); + } else { + Value* SubV = SubLoader.loadNonSimple(Inst); + Result = cast(R.createWrRegion( + Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + Inst, Inst->getDebugLoc())); + } + if (AddedInstructions) + AddedInstructions->push_back(Result); + RemainingBits &= ~(255 << Idx); + } + if (!RemainingBits) + return Result; + } + + // Build the splat sets, that is, the sets of elements of identical value. + SmallVector SplatSets; + { + ValueMap SplatSetFinder; + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = Elements[i]; + if (!isa(El)) { + std::pair::iterator, bool> Created + = SplatSetFinder.insert(std::pair(El, + SplatSets.size())); + if (Created.second) { + // First time this Constant has been seen. + SplatSets.push_back(1 << i); + } else { + // Add on to existing splat set. + SplatSets[Created.first->second] |= 1 << i; + } + } + } + } + // Remove any splat set with only a single element. + unsigned NewSize = 0; + for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) { + if (countPopulation(SplatSets[i]) >= 2) + SplatSets[NewSize++] = SplatSets[i]; + } + SplatSets.resize(NewSize); + // Determine which elements are suitable for inclusion in a packed vector. + // FIXME Not implemented yet. For an int vector constant, we need to + // determine whether the instruction expects the operand to be signed + // or unsigned. + + // Loop constructing the constant until it is complete. + do { + // Find the splat set that will contribute the most elements + // to the vector, taking into account what elements we can access + // in a 1D region write. (Initialize BestSplatSetBits so, if no best + // splat is found, we just do a single element out of RemainingBits.) + // + // Note that we are looking for the splat set that sets the most elements, + // not the one that _usefully_ sets the most elements. For example, + // Examples/sepia has a constant vector of the form + // < A, B, C, 0, 0, A, B, C > + // We have four splat sets {0,5} {1,6} {2,7} {3,4}, each of which + // has two elements. What we want to do is set one of the A, B or C + // sets first, rather than the 0s, because region restrictions mean that + // we can only set such a pair if we do it first. If the loop below were + // to find the splat set that _usefully_ sets the most elements, all four + // sets would say "2" and we would arbitrarily pick one of them. But, if + // we ask each splat set how many elements it sets, even uselessly, then + // the A, B and C sets say "8" and the 0 set says "2", and we ensure that + // we do one of the A, B or C sets first. + // So we end up setting the constant in this order (arbitrarily picking + // A first): + // < A, A, A, A, A, A, A, A > + // < 0, 0 > + // < B > + // < B > + // < C > + // < C > + // giving five wrregion instructions rather than six. 
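+    // Spelled out for the example above, the splat-set bitmaps are
+    // A = 0b00100001, B = 0b01000010, C = 0b10000100 and the zero set is
+    // 0b00011000. A, B and C each span elements 0..7, so getRegionBits may
+    // consider a full 8-wide region for them, while the zero set only spans
+    // elements 3..4 and is limited to a 2-wide region; hence the "8" versus
+    // "2" above.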
+ unsigned BestSplatSetBits = 1 << genx::log2(RemainingBits); + unsigned BestSplatSetUsefulBits = BestSplatSetBits; + unsigned BestSplatSetCount = 1; + Constant *BestSplatSetConst = Elements[genx::log2(RemainingBits)]; + for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) { + unsigned Bits = getRegionBits(SplatSets[i] & RemainingBits, + SplatSets[i] | RemainingBits | UndefBits, NumElements); + unsigned Count = countPopulation(Bits); + // For this splat set, Bits is a bitmap of the vector elements that + // we can set in this splat set in a legal 1D region (possibly including + // elements already set and undef elements), and Count is how many + // elements that still need setting the region will set. + if (Count > BestSplatSetCount) { + BestSplatSetBits = Bits; + BestSplatSetUsefulBits = Bits & SplatSets[i]; + BestSplatSetCount = Count; + BestSplatSetConst = Elements[genx::log2(SplatSets[i])]; + } + } + // Now BestSplatSetBits is a bitmap of the vector elements to include in + // the best splat. Set up the splatted constant. + if (!Result) { + // For the first time round the loop, just splat the whole vector, + // whatever BestSplatBits says. + Result = loadConstant(ConstantVector::getSplat( + NumElements, BestSplatSetConst), Inst, AddedInstructions); + Result->setDebugLoc(Inst->getDebugLoc()); + } else { + // Not the first time round the loop. Set up the splatted subvector, + // and write it as a region. + Region R(BestSplatSetBits, + VT->getElementType()->getPrimitiveSizeInBits() / 8); + Constant *NewConst = ConstantVector::getSplat(R.NumElements, + BestSplatSetConst); + Result = cast(R.createWrConstRegion(Result, NewConst, "constant", + Inst, Inst->getDebugLoc())); + if (AddedInstructions) + AddedInstructions->push_back(Result); + } + RemainingBits &= ~BestSplatSetUsefulBits; + } while (RemainingBits); + return Result; +} + +/*********************************************************************** + * getRegionBits : determine which vector elements we can set with a + * 1D region + * + * Enter: NeededBits = bits for vector elements we need to set + * OptionalBits = bits for vector elements we could set + * VecWidth = number of elements in vector + * + * Return: bits for vector elements to set as a legal 1D region, + * maximizing how many of NeededBits are set + */ +unsigned ConstantLoader::getRegionBits(unsigned NeededBits, + unsigned OptionalBits, unsigned VecWidth) +{ + if (!NeededBits) + return 0; + // Get the first and last element numbers in NeededBits. + unsigned FirstNeeded = countTrailingZeros(NeededBits, ZB_Undefined); + unsigned LastNeeded = 31 - countLeadingZeros((uint32_t)NeededBits, ZB_Undefined); + // Set the max width to the min size including both those elements + // rounded up to the next power of two. + unsigned MaxWidth = LastNeeded - FirstNeeded + 1; + unsigned LogMaxWidth = genx::log2(MaxWidth); + if (MaxWidth != 1U << LogMaxWidth) { + ++LogMaxWidth; + MaxWidth = 1U << LogMaxWidth; + } + // Special case NeededBits only having one element. + if (LogMaxWidth == 0) + return NeededBits; + // Now find the best region. + unsigned BestBits = 0; + unsigned BestCount = 0; + // Try each stride. + static const unsigned StrideBitsTable[] = { 0xffffffffU, 0x55555555U, 0x11111111U }; + for (unsigned LogStride = 0, Stride = 1; + LogStride <= 2U && LogStride < LogMaxWidth; + ++LogStride, Stride <<= 1U) { + // Try each width (not including 1). 
+ for (unsigned Width = 1U << (LogMaxWidth - LogStride); Width > 1; Width >>= 1) { + if (Width <= BestCount) + break; + // Try each start index. + for (unsigned Idx = 0; Idx + (Width - 1) * Stride < VecWidth; ++Idx) { + if (Idx + Width > VecWidth) + break; + // Calculate which indexes the region will set. + unsigned Bits = StrideBitsTable[LogStride]; + if (Width != 32) + Bits &= (1 << Width) - 1; + Bits <<= Idx; + // See if it sets any elements that we are not allowed to set. + if (Bits & ~(NeededBits | OptionalBits)) + continue; + // See if it sets all of NeededBits. + if ((Bits & NeededBits) == NeededBits) + return Bits; + // See if it is the best one we have seen so far. + unsigned Count = countPopulation(Bits & NeededBits); + if (Count > BestCount) { + BestCount = Count; + BestBits = Bits; + if (BestCount == Width) + break; + } + } + } + } + if (!BestCount) { + // We could not find any region that includes more than one of NeededBits. + // Just do a single element. + return 1 << genx::log2(NeededBits); + } + return BestBits; +} + +Instruction *ConstantLoader::loadSplatConstant(Instruction *InsertPos) { + // Skip scalar types, vector type with just one element, or boolean vector. + VectorType *VTy = dyn_cast(C->getType()); + if (!VTy || + VTy->getNumElements() == 1 || + VTy->getScalarType()->isIntegerTy(1)) + return nullptr; + // Skip non-splat vector. + Constant *C1 = C->getSplatValue(); + if (!C1) + return nullptr; + // Create <1 x T> constant and broadcast it through rdregion. + Constant *CV = ConstantVector::get(C1); + // Load that scalar constant first. + ConstantLoader L(CV, Subtarget); + Value *V = L.load(InsertPos); + // Broadcast through rdregion. + Region R(V); + R.Width = R.NumElements = VTy->getNumElements(); + R.Stride = 0; + R.VStride = 0; + R.Offset = 0; + Instruction *NewInst = R.createRdRegion(V, ".constsplat", InsertPos, DebugLoc()); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; +} + +/*********************************************************************** + * ConstantLoader::load : insert instruction to load a constant + * + * We use llvm.genx.constant, rather than bitcast, because CSE has a habit + * of propagating a constant bitcast back into our operand that is not + * allowed to be constant. + * + * Enter: C = constant to load + * InsertBefore = insert new instruction before here + * + * Return: new instruction + */ +Instruction *ConstantLoader::load(Instruction *InsertBefore) +{ + assert(isSimple()); + // Do not splat load on byte data as HW does not support byte imm source. + if (!C->getType()->getScalarType()->isIntegerTy(8)) + if (auto NewInst = loadSplatConstant(InsertBefore)) + return NewInst; + + if (!PackedFloat && !PackedIntScale && !isa(C)) { // not packed int constant or undef + if (auto CC = getConsolidatedConstant(C)) { + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + Instruction *NewInst = loadConstant(CC, InsertBefore, AddedInstructions); + NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(), + "constant", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; + } + } + + // Load the constant as normal. 
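+  // For example, assuming the usual GenX intrinsic type-suffix mangling, an
+  // <8 x i32> constant is materialized as something like
+  //   %constant = call <8 x i32> @llvm.genx.constanti.v8i32(<8 x i32> <...>)
+  // with genx_constantf and genx_constantpred chosen below for floating point
+  // and predicate constants respectively.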
+ Value *Args[] = { C }; // Args to new llvm.genx.constant + Type *OverloadedTypes[] = { C->getType() }; + GenXIntrinsic::ID IntrinsicID = GenXIntrinsic::genx_constanti; + if (C->getType()->isFPOrFPVectorTy()) + IntrinsicID = GenXIntrinsic::genx_constantf; + else if (C->getType()->getScalarType()->isIntegerTy(1)) + IntrinsicID = GenXIntrinsic::genx_constantpred; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, OverloadedTypes); + Instruction *NewInst = CallInst::Create(Decl, Args, "constant", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; +} + +/*********************************************************************** + * ConstantLoader::loadBig : insert instruction to load a constant that might + * be illegally sized + */ +Instruction *ConstantLoader::loadBig(Instruction *InsertBefore) +{ + if (isLegalSize() || isa(C)) { + // Does not need legalizing. + if (!isSimple()) + return loadNonSimple(InsertBefore); + return load(InsertBefore); + } + assert(!C->getType()->getScalarType()->isIntegerTy(1) && "not expecting predicate in here"); + if (Constant *Consolidated = getConsolidatedConstant(C)) { + // Load as a consolidated constant, then bitcast to the correct type. + auto Load = ConstantLoader(Consolidated, nullptr, AddedInstructions, Subtarget) + .loadBig(InsertBefore); + assert(Load); + Load = CastInst::Create(Instruction::BitCast, Load, C->getType(), + Load->getName() + ".cast", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(Load); + return Load; + } + auto VT = cast(C->getType()); + unsigned NumElements = VT->getNumElements(); + unsigned LogElementBits = genx::log2( + VT->getElementType()->getPrimitiveSizeInBits()); + unsigned MaxSize = 1 << (9/*log 2xGRFsize*/ - LogElementBits); + MaxSize = std::min(MaxSize, 32U); + Instruction *Result = nullptr; + for (unsigned Idx = 0; Idx != NumElements; ) { + unsigned Size = std::min(1U << genx::log2(NumElements - Idx), MaxSize); + // Load this subvector constant if necessary, and insert into the overall + // value with wrregion. + Constant *SubC = getConstantSubvector(C, Idx, Size); + Value *SubV = SubC; + ConstantLoader SubLoader(SubC, Subtarget); + if (!SubLoader.isSimple()) + SubV = SubLoader.loadNonSimple(InsertBefore); + Region R(C); + R.getSubregion(Idx, Size); + Result = cast(R.createWrRegion( + Result ? 
(Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + InsertBefore, DebugLoc())); + if (AddedInstructions) + AddedInstructions->push_back(Result); + Idx += Size; + } + return Result; +} + +/*********************************************************************** + * ConstantLoader::isLegalSize : detect if a constant is a legal size + */ +bool ConstantLoader::isLegalSize() +{ + auto VT = dyn_cast(C->getType()); + if (!VT) + return true; + int NumBits = C->getType()->getPrimitiveSizeInBits(); + if (!llvm::isPowerOf2_32(NumBits)) + return false; + int GRFSize = 32; + if (Subtarget) + GRFSize = Subtarget->getGRFWidth(); + if (NumBits > GRFSize * 8 /*bytes*/ * 2) + return false; // bigger than 2 GRFs + if (VT->getNumElements() > 32) + return false; // 64 bytes not allowed + return true; +} + +/*********************************************************************** + * ConstantLoader::isBigSimple : detect if a constant is either simple, + * or would be simple after being split into legal sizes + * + * This does not do a thorough check so it misses some cases of a constant + * that would split into simple constants. + */ +bool ConstantLoader::isBigSimple() +{ + assert(!needFixingSimple() && + "simple case shall be fixed first before this call"); + if (isa(C)) + return true; // undef is simple + auto VT = dyn_cast(C->getType()); + if (!VT) + return true; // scalar always simple + if (C->getSplatValue()) + return true; // splat constant always simple + if (VT->getElementType()->getPrimitiveSizeInBits() == 1) + return true; // predicate constant always simple + return false; +} + +/*********************************************************************** + * ConstantLoader::isSimple : detect if a constant is "simple" + * + * A simple constant is one we know can be a constant operand in an instruction. + */ +bool ConstantLoader::isSimple() +{ + assert(!needFixingSimple() && + "simple case shall be fixed first before this call"); + if (isa(C)) + return true; // undef is simple (and generates no vISA code) + if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue()) + return true; // all 1s predicate is simple + if(User && User->isBinaryOp()) + if (isa(C->getType())) + if (auto splat = C->getSplatValue()) + if (splat->isZeroValue()) + return true; + if (!isLegalSize()) + return false; // Simple constant must be legally sized + if (isBigSimple()) + return true; // a big simple constant that is legally sized is simple + if (isPackedIntVector()) + return true; + if (isPackedFloatVector()) + return true; + return false; +} + +/*********************************************************************** + * ConstantLoader::isPackedIntVector : check for a packed int vector + * (having already done the analysis in the ConstantLoader constructor) + */ +bool ConstantLoader::isPackedIntVector() +{ + // Check for a packed int vector. Either the element type must be i16, or + // the user (instruction using the constant) must be genx.constanti or + // wrregion or wrconstregion. Not allowed if the user is a logic op. 
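+  // For example, assuming the packed immediate holds eight 4-bit fields
+  // (ImmIntVec::MinUInt/MaxUInt of 0/15 and MinSInt/MaxSInt of -8/7),
+  // <8 x i16> <0, 3, 15, 7, 2, 9, 1, 4> qualifies as an unsigned packed
+  // vector (scale 1, adjust 0) and <8 x i16> <-8, -1, 7, 0, 3, -5, 2, 6>
+  // as a signed one (scale 1, adjust -8), as analyzeForPackedInt records.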
+ if (PackedIntScale == 1 && (PackedIntAdjust == 0 || PackedIntAdjust == -8)) { + if (!User) + return true; // user not specified -- assume it is a mov, so wrong element + // size is allowed + if (!C->getType()->getScalarType()->isIntegerTy(16) + && GenXIntrinsic::getGenXIntrinsicID(User) != GenXIntrinsic::genx_constanti + && !GenXIntrinsic::isWrRegion(User)) + return false; // wrong element size when it is not a mov + switch (User->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return false; // disallow packed vector in logic op + default: + break; + } + return true; + } + return false; +} + +/*********************************************************************** + * ConstantLoader::isPackedFloatVector : check for a packed float vector + * (having already done the analysis in the ConstantLoader constructor) + */ +bool ConstantLoader::isPackedFloatVector() { + VectorType *VT = dyn_cast(C->getType()); + if (!VT) + return false; + if (VT->getNumElements() > 4) + return false; + return PackedFloat; +} + +/*********************************************************************** + * ConstantLoader::getConsolidatedConstant : get the consolidated constant + * for the given constant + * + * A "consolidated constant" is one where a vector of byte or short is + * turned into the equivalent (as if by bitcast) vector of int. + */ +Constant *ConstantLoader::getConsolidatedConstant(Constant *C) +{ + if (isa(C)) + return nullptr; + VectorType *VT = dyn_cast(C->getType()); + if (!VT) + return nullptr; + unsigned BytesPerElement = VT->getElementType()->getPrimitiveSizeInBits() / 8; + unsigned NumElements = VT->getNumElements(); + if (!BytesPerElement) + return nullptr; // vector of i1 + if (BytesPerElement >= 4) + return nullptr; // already vector of i32/i64/float/double + if (NumElements * BytesPerElement & 3) + return nullptr; // not a multiple of 4 bytes long + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + unsigned Compaction = BytesPerElement == 1 ? 4 : 2; + unsigned Mask = BytesPerElement == 1 ? 0xff : 0xffff; + SmallVector Elements; + Type *I32Ty = Type::getInt32Ty(C->getContext()); + for (unsigned i = 0; i != NumElements; i += Compaction) { + unsigned Val = 0; + bool IsUndef = true; + for (unsigned j = 0; j != Compaction; ++j) { + unsigned Bits = 0; + Constant *El = C->getAggregateElement(i + j); + // We assume that anything that is not ConstantInt is undefined. That + // can include a constant expression with an undefined value in the + // middle. + if (auto CI = dyn_cast(El)) { + Bits = CI->getSExtValue(); + IsUndef = false; + } + else if (auto CI = dyn_cast(El)) { + APFloat V = CI->getValueAPF(); + Bits = V.bitcastToAPInt().getZExtValue(); + IsUndef = false; + } + Val |= (Bits & Mask) << (j * BytesPerElement * 8); + } + if (IsUndef) + Elements.push_back(UndefValue::get(I32Ty)); + else + Elements.push_back(ConstantInt::get(I32Ty, Val)); + } + // Construct the constant with i32 element type. + return ConstantVector::get(Elements); +} + +/*********************************************************************** + * ConstantLoader::analyze : analyze a constant value + * + * This analyzes whether a constant of no more than the right vector width + * (integer 8 or fp 4) can be loaded as a packed vector, possibly scaled + * and adjusted. 
+ */ +void ConstantLoader::analyze() +{ + auto VT = dyn_cast(C->getType()); + if (!VT) + return; + if (C->getSplatValue()) + return; // don't analyze if already a splat + unsigned NumElements = VT->getNumElements(); + if (NumElements <= 8 && VT->getElementType()->isIntegerTy()) + analyzeForPackedInt(NumElements); + else if (NumElements <= 8 && VT->getElementType()->isFloatingPointTy()) + analyzeForPackedFloat(NumElements); +} + +void ConstantLoader::analyzeForPackedInt(unsigned NumElements) +{ + // Get element values. + int64_t Min = INT64_MAX; + int64_t Max = INT64_MIN; + SmallVector Elements; + Constant *SomeDefinedElement = nullptr; + for (unsigned i = 0; i != NumElements; ++i) { + auto El = C->getAggregateElement(i); + if (isa(El)) + continue; + SomeDefinedElement = El; + int64_t Element = cast(El)->getSExtValue(); + Elements.push_back(Element); + Min = std::min(Min, Element); + Max = std::max(Max, Element); + } + if (Elements.empty()) { + // Constant is undef. + assert(C == UndefValue::get(C->getType()) && + "constant consists only of undef elements only if it's undef itself"); + return; + } + if (Elements.size() == 1) { + // All but one element undef. Turn into a splat constant. + NewC = ConstantVector::getSplat(NumElements, SomeDefinedElement); + return; + } + if (Max - Min <= ImmIntVec::MaxUInt) { + if (Min >= ImmIntVec::MinUInt && Max <= ImmIntVec::MaxUInt) { + // Values all in the range [MinUInt..MaxUInt]. We can do this with a packed + // unsigned int with no extra scaling or adjustment. + PackedIntScale = 1; + PackedIntAdjust = 0; + PackedIntMax = Max; + return; + } + if (Min >= ImmIntVec::MinSInt && Max <= ImmIntVec::MaxSInt) { + // Values all in the range [MinSInt..MaxSInt]. We can do this with a packed + // unsigned int with no extra scaling or adjustment. + PackedIntScale = 1; + PackedIntAdjust = -8; + PackedIntMax = Max + 8; + return; + } + // Values all in the range [Min..Min+MaxUInt]. We can do this + // with a packed int with an adjustment. + PackedIntScale = 1; + PackedIntAdjust = Min; + PackedIntMax = Max - Min; + return; + } + // Get unique absolute differences, so we can detect if we have a valid + // packed int vector that is then scaled and has a splatted constant + // added/subtracted. + SmallVector Diffs; + SmallSet DiffsSet; + for (unsigned i = 0, e = Elements.size() - 1; i != e; ++i) { + Min = std::min(Min, Elements[i + 1]); + Max = std::max(Max, Elements[i + 1]); + int64_t Diff = Elements[i + 1] - Elements[i]; + if (!Diff) + continue; + if (Diff < 0) + Diff = -Diff; + if (Diff > UINT_MAX) + return; + if (DiffsSet.insert((unsigned)Diff).second) + Diffs.push_back((unsigned)Diff); + } + assert(!Diffs.empty() && "not expecting splatted constant"); + // Calculate the GCD (greatest common divisor) of the diffs using the binary + // GCD algorithm http://en.wikipedia.org/wiki/Binary_GCD_algorithm + unsigned GCD = Diffs[0]; + if (Diffs.size() > 1) { + // Remove factors of 2. + unsigned MaxPowerOfTwo = 31; + for (unsigned i = 0, e = Diffs.size(); i != e; ++i) + MaxPowerOfTwo = std::min(MaxPowerOfTwo, + (unsigned)countTrailingZeros(Diffs[i], ZB_Undefined)); + if (MaxPowerOfTwo) + for (unsigned i = 0, e = Diffs.size(); i != e; ++i) + Diffs[i] >>= MaxPowerOfTwo; + // Apply the rest of the binary GCD algorithm to Diffs[0] and Diffs[1] + // first, then to the (not yet scaled by the power of two) GCD so far + // and each other element of Diffs in turn. 
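+    // Worked example (illustrative): for <3 x i32> <1, 13, 43> the diffs are
+    // {12, 30}; removing the common factor of two gives MaxPowerOfTwo = 1 and
+    // Diffs = {6, 15}; the loop below reduces these to 3, which is scaled
+    // back up to GCD = 6, so the constant can be loaded as 6 * <0, 2, 7> + 1,
+    // with PackedIntScale = 6 and PackedIntAdjust = Min = 1.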
+ unsigned V = Diffs[0]; + for (unsigned i = 1, e = Diffs.size(); i != e; ++i) { + unsigned U = Diffs[i]; + for (;;) { + while (!(U & 1)) + U >>= 1; + while (!(V & 1)) + V >>= 1; + if (U == V) + break; + if (U < V) + std::swap(U, V); // make U >= V + U = (U - V) / 2; + } + } + // Scale the resulting GCD by the common power of two. + GCD = V << MaxPowerOfTwo; + } + if ((Max - Min) > GCD * ImmIntVec::MaxUInt) + return; // range of values too big. + PackedIntScale = GCD; + PackedIntMax = ImmIntVec::MaxUInt; + // Special case adjust of 0 or -8 as then we can save doing an adjust at all + // by using unsigned or signed packed vector respectively. + if (!(Min % GCD)) { + if (Min >= ImmIntVec::MinUInt && Max <= GCD * ImmIntVec::MaxUInt) { + PackedIntAdjust = ImmIntVec::MinUInt; + return; + } + if (Min >= ImmIntVec::MinSInt * GCD && Max <= ImmIntVec::MaxSInt * GCD) { + PackedIntAdjust = Min; + PackedIntMax = ImmIntVec::MaxSInt; + return; + } + // Special case all pre-scaled values being in [-15,0] as we can do that + // by negating the scale and not needing to adjust. + if (Min >= -ImmIntVec::MaxUInt * GCD && Max <= -ImmIntVec::MinUInt) { + PackedIntAdjust = ImmIntVec::MinUInt; + PackedIntScale = -PackedIntScale; + return; + } + } + PackedIntAdjust = Min; +} + +static bool is8bitPackedFloat(float f) { + union { + float f; + unsigned u; + } u; + + u.f = f; + unsigned Exp = (u.u >> 23) & 0xFF; + unsigned Frac = u.u & 0x7FFFFF; + if (Exp == 0 && Frac == 0) + return true; + if (Exp < 124 || Exp > 131) + return false; + if ((Frac & 0x780000) != Frac) + return false; + Frac >>= 19; + if (Exp == 124 && Frac == 0) + return false; + return true; +} + +void ConstantLoader::analyzeForPackedFloat(unsigned NumElements) { + for (unsigned i = 0; i != NumElements; ++i) { + auto Elt = C->getAggregateElement(i); + if (isa(Elt)) + continue; + ConstantFP *CFP = dyn_cast(Elt); + // Bail out if any element cannot be analyzed. + if (!CFP) + return; + const APFloat &FP = CFP->getValueAPF(); + // Bail out if it's not supported. + // TODO: Only support single precision so far. + if (&FP.getSemantics() != &APFloat::IEEEsingle()) + return; + // Bail out if it's not finite. + if (!FP.isFinite()) + return; + // Check if it could be represented in 8-bit packed float. + if (!is8bitPackedFloat(FP.convertToFloat())) + return; + } + PackedFloat = true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h new file mode 100644 index 000000000000..83fd5879a323 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h @@ -0,0 +1,135 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#ifndef GENX_CONSTANTS_H +#define GENX_CONSTANTS_H + +#include "GenXSubtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { +namespace genx { + +// ConstantLoader : class to insert instruction(s) to load a constant +class ConstantLoader { + Constant *C; + Instruction *User; + // NewC != nullptr signals that we should replace C with NewC in User + // nothing to do otherwise + Constant *NewC = nullptr; + // AddedInstructions: a vector that the caller has requested any added + // instructions to be pushed in to. + SmallVectorImpl *AddedInstructions; + // Info from analyzing for possible packed vector constant. + int PackedIntScale = 0; // amount to scale packed int vector by + int64_t PackedIntAdjust; // amount to adjust by, special casing 0 or -8 + // when PackedIntScale is 1 + unsigned PackedIntMax; // max value in packed vector, used when scale is + // 1 and adjust is 0 to tell whether it would fit + // in 0..7 + bool PackedFloat = false; + +public: + // Constructor + // User = the instruction that uses the constant. If this is genx.constanti, + // then a packed vector constant can be an isSimple() constant even + // when the element type is not i16. Also used to disallow a packed + // vector constant in a logic op. If User==0 then it is assumed that + // a packed vector constant with an element type other than i16 is OK. + // AddedInstructions = vector to add new instructions to when loading a + // non simple constant, so the caller can see all the newly added + // instructions. + ConstantLoader(Constant *C, Instruction *User = nullptr, + SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr) + : C(C), User(User), AddedInstructions(AddedInstructions), + Subtarget(Subtarget) { + analyze(); + } + ConstantLoader(Constant *C, const GenXSubtarget *Subtarget) + : ConstantLoader(C, nullptr, nullptr, Subtarget) {} + Instruction *load(Instruction *InsertBefore); + Instruction *loadBig(Instruction *InsertBefore); + Instruction *loadNonSimple(Instruction *InsertBefore); + bool needFixingSimple() const { return NewC; } + void fixSimple(int OperandIdx); + bool isBigSimple(); + bool isSimple(); + bool isLegalSize(); + +private: + const GenXSubtarget *Subtarget; + bool isPackedIntVector(); + bool isPackedFloatVector(); + void analyze(); + Constant *getConsolidatedConstant(Constant *C); + unsigned getRegionBits(unsigned NeededBits, unsigned OptionalBits, + unsigned VecWidth); + void analyzeForPackedInt(unsigned NumElements); + void analyzeForPackedFloat(unsigned NumElements); + Instruction *loadSplatConstant(Instruction *InsertPos); +}; + +// Some instructions force their operands to be constants. +// Check here if operand of instruction must be constant. +inline bool opMustBeConstant(Instruction *I, unsigned OpNum) { + // Mask of shufflevector should always be constant. + if (isa(I)) + return OpNum == 2; + return false; +} + +// Load a constant using the llvm.genx.constant intrinsic. 
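+// Typical usage of ConstantLoader, as an illustrative sketch only (Inst, C,
+// OpIdx and ST stand for whatever instruction, constant operand, operand
+// index and subtarget the caller already has; the needFixingSimple() case is
+// ignored for brevity):
+//   SmallVector<Instruction *, 8> Added;
+//   ConstantLoader CL(C, Inst, &Added, ST);
+//   if (!CL.isSimple())
+//     Inst->setOperand(OpIdx, CL.loadNonSimple(Inst));
+// The helpers below wrap this pattern for the common cases.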
+inline Instruction * +loadConstant(Constant *C, Instruction *InsertBefore, + SmallVectorImpl *AddedInstructions = nullptr) { + return ConstantLoader(C, nullptr, AddedInstructions).load(InsertBefore); +} + +// Load non-simple constants used in an instruction. +bool loadNonSimpleConstants( + Instruction *Inst, + SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr); + +bool loadConstantsForInlineAsm( + CallInst *Inst, SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr); + +// Load constants used in an instruction. +bool loadConstants(Instruction *Inst, const GenXSubtarget *Subtarget = nullptr); + +// Load constants used in phi nodes in a function. +bool loadPhiConstants(Function *F, DominatorTree *DT, + bool ExcludePredicate = false, + const GenXSubtarget *Subtarget = nullptr); + +} // namespace genx +} // namespace llvm + +#endif // GENX_CONSTANTS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp new file mode 100644 index 000000000000..a0309007c373 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp @@ -0,0 +1,746 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXDeadVectorRemoval +/// --------------------- +/// +/// GenXDeadVectorRemoval is an aggressive dead code removal pass that analyzes +/// individual elements of a vector rather than whole values. +/// +/// As a result of this analysis, the pass can then make the two following +/// modifications to the code: +/// +/// 1. If all vector elements of an instruction result turn out to be unused, the +/// instruction is removed. In fact, this pass just sets all its uses to +/// undef, relying on the subsequent dead code removal pass to actually +/// remove it. +/// +/// 2. If all vector elements of the "old value" input (even a constant) of a +/// wrregion turn out to be unused, then that input is set to undef. This +/// covers further cases over (1) above: +/// +/// a. the "old value" input is constant, and we want to turn it into undef +/// to save a useless constant load; +/// +/// b. 
the "old value" input is an instruction that does have elements used +/// elsewhere, and we want to turn it into undef to detach the two webs +/// of defs and uses from each other to reduce register pressure in +/// between. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_DEAD_VECTOR_REMOVAL" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +#include +#include + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +static cl::opt LimitGenXDeadVectorRemoval("limit-genx-dead-vector-removal", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX dead element removal.")); + +namespace { + +// LiveBitsStorage : encapsulate how live bits for a vector value are stored +// For 31/63 elements or fewer, the bitmap is inside the LiveBitsStorage +// object. For 32/64 elements or more, the bitmap is separately allocated. +class LiveBitsStorage { + uintptr_t V; +public: + LiveBitsStorage() : V(0) {} + ~LiveBitsStorage() { + if (auto P = getExternal()) + delete[] P; + V = 0; + } +private: + // getExternal : get the external pointer, 0 if none + // Whether we have an external pointer is encoded in the top bit. + // The pointer itself is shifted down one and stored in the other bits. + uintptr_t *getExternal() { + if ((intptr_t)V >= 0) + return nullptr; // top bit not set, not external + return (uintptr_t *)(V * 2); + } + // setExternal : set the external pointer + void setExternal(uintptr_t *P) { + assert(!getExternal()); + V = (uintptr_t)P >> 1 | (uintptr_t)1U << (sizeof(uintptr_t) * 8 - 1); + } +public: + // setNumElements : set the number of elements to be stored in this + // LiveBitsStorage. Allocate external storage if necessary. 
+ void setNumElements(unsigned NumElements) { + if (NumElements >= sizeof(uintptr_t) * 8 - 1) { + unsigned Size = NumElements + sizeof(uintptr_t) * 8 - 1 + / (sizeof(uintptr_t) * 8); + setExternal(new uintptr_t[Size]); + memset(getExternal(), 0, Size * sizeof(uintptr_t)); + } + } + // get : get the pointer to the bitmap + uintptr_t *get() { + if (auto P = getExternal()) + return P; + return &V; + } +}; + +// LiveBits : encapsulate a pointer to a bitmap of element liveness and its size +class LiveBits { + uintptr_t *P; + unsigned NumElements; +public: + static const unsigned BitsPerWord = sizeof(uintptr_t) * 8; + LiveBits() : P(nullptr), NumElements(0) {} + LiveBits(LiveBitsStorage *LBS, unsigned NumElements) + : P(LBS->get()), NumElements(NumElements) {} + // getNumElements : get the number of elements in this bitmap + unsigned getNumElements() const { return NumElements; } + // get : get a bit value + bool get(unsigned Idx) const { + assert(Idx < NumElements); + return P[Idx / BitsPerWord] >> (Idx % BitsPerWord) & 1; + } + // isAllZero : return true if all bits zero + bool isAllZero() const; + // set : set a bit value + // Returns true if value changed + bool set(unsigned Idx, bool Val = true); + // copy : copy all bits from another LiveBits + // Returns true if value changed + bool copy(LiveBits Src); + // orBits : or all bits from another LiveBits into this one + // Returns true if value changed + bool orBits(LiveBits Src); + // setRange : set range of bits, returning true if any changed + bool setRange(unsigned Start, unsigned Len); + // debug print + void print(raw_ostream &OS) const; +}; + +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const LiveBits &LB) { + LB.print(OS); + return OS; +} +#endif + +// GenXDeadVectorRemoval : dead vector element removal pass +class GenXDeadVectorRemoval : public FunctionPass { + std::map InstMap; + std::set WorkListSet; + std::queue WorkList; + std::set WrRegionsWithUsedOldInput; + bool WorkListPhase; +public: + static char ID; + explicit GenXDeadVectorRemoval() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX dead vector element removal pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + void clear() { + InstMap.clear(); + WorkListSet.clear(); + assert(WorkList.empty()); + WrRegionsWithUsedOldInput.clear(); + } + bool nullOutInstructions(Function *F); + void processInst(Instruction *Inst); + void processRdRegion(Instruction *Inst, LiveBits LB); + void processWrRegion(Instruction *Inst, LiveBits LB); + void processBitCast(Instruction *Inst, LiveBits LB); + void processElementwise(Instruction *Inst, LiveBits LB); + void markWhollyLive(Value *V); + void addToWorkList(Instruction *Inst); + LiveBits getLiveBits(Instruction *Inst, bool Create = false); +}; + +} // end anonymous namespace + + +char GenXDeadVectorRemoval::ID = 0; +namespace llvm { void initializeGenXDeadVectorRemovalPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXDeadVectorRemoval, "GenXDeadVectorRemoval", "GenXDeadVectorRemoval", false, false) +INITIALIZE_PASS_END(GenXDeadVectorRemoval, "GenXDeadVectorRemoval", "GenXDeadVectorRemoval", false, false) + +FunctionPass *llvm::createGenXDeadVectorRemovalPass() +{ + initializeGenXDeadVectorRemovalPass(*PassRegistry::getPassRegistry()); + return new GenXDeadVectorRemoval(); +} + +void GenXDeadVectorRemoval::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + 
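+// As a small illustration of what the analysis below achieves: if a wrregion
+// writes elements 8..15 of a <16 x i32> value and the only uses of its result
+// are rdregions reading elements 0..7, then no element of its "new value"
+// input is live, and nullOutInstructions() bypasses the wrregion with its
+// "old value" input; an instruction with no live elements at all instead has
+// its uses replaced by undef.
+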
+/*********************************************************************** + * isRootInst : check if this is a "root" instruction, one that we want to + * keep even if unused + */ +static bool isRootInst(Instruction *Inst) { + if (isa(Inst) || isa(Inst) || + Inst->isTerminator() || Inst->mayHaveSideEffects()) + return true; + if (auto CI = dyn_cast(Inst)) + return !CI->onlyReadsMemory(); + return false; +} + +/*********************************************************************** + * GenXDeadVectorRemoval::runOnFunction : process one function + */ +bool GenXDeadVectorRemoval::runOnFunction(Function &F) +{ + // First scan all the code to compute the initial live set + WorkListPhase = false; + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + for (Instruction *Inst = BB->getTerminator(); Inst;) { + if (isRootInst(Inst)) + processInst(Inst); + else if (WorkListSet.count(Inst)) { + if (!isa(Inst)) + WorkListSet.erase(Inst); + processInst(Inst); + } + Inst = (Inst == &BB->front()) ? nullptr : Inst->getPrevNode(); + } + } + + WorkListPhase = true; + // initialize the worklist + for (auto Inst : WorkListSet) { + WorkList.push(Inst); + } + // process until the work list is empty. + LLVM_DEBUG(dbgs() << "GenXDeadVectorRemoval: process work list\n"); + while (!WorkList.empty()) { + Instruction *Inst = WorkList.front(); + WorkList.pop(); + WorkListSet.erase(Inst); + processInst(Inst); + } + // Null out unused instructions so the subsequent dead code removal pass + // removes them. + LLVM_DEBUG(dbgs() << "GenXDeadVectorRemoval: null out instructions\n"); + bool Modified = nullOutInstructions(&F); + clear(); + return Modified; +} + +/*********************************************************************** + * nullOutInstructions : null out unused instructions so the subsequent dead + * code removal pass removes them + * + * For wrregion, there are two special cases: + * - when no elements in the "new value" input of a wrregion are use, + * then bypass the wrregion with the "old value". + * - when no elements in the "old value" input of a wrregion are used, + * then changes the input to undef. + */ +bool GenXDeadVectorRemoval::nullOutInstructions(Function *F) +{ + static unsigned Count = 0; + bool Modified = false; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + for (auto bi = fi->begin(), be = fi->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + // Ignore "root" instructions. + if (isRootInst(Inst)) + continue; + // See if the instruction has no used elements. If so, null out its uses. + auto LB = getLiveBits(Inst); + if (LB.isAllZero()) { + if (++Count > LimitGenXDeadVectorRemoval) + return Modified; + if (LimitGenXDeadVectorRemoval != UINT_MAX) + dbgs() << "-limit-genx-dead-vector-removal " << Count << "\n"; + LLVM_DEBUG(if (!Inst->use_empty()) + dbgs() << "nulled out uses of " << *Inst << "\n"); + while (!Inst->use_empty()) { + Use *U = &*Inst->use_begin(); + *U = UndefValue::get((*U)->getType()); + } + Modified = true; + } else if (GenXIntrinsic::isWrRegion(Inst)) { + if (!Inst->use_empty()) { + auto *SI = dyn_cast(Inst->user_back()); + if (SI && genx::isGlobalStore(SI)) { + assert(Inst->hasOneUse() && + "Wrregion in gstore bale has more than one use"); + continue; + } + } + // Otherwise, for a wrregion, check if it is in the old input used set. 
+ // If not, then no element of the "old value" input is used by this + // instruction (even if it has bits set from other uses), and we can + // undef out the input. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (WrRegionsWithUsedOldInput.find(Inst) + == WrRegionsWithUsedOldInput.end()) { + if (!isa(*U)) { + if (++Count > LimitGenXDeadVectorRemoval) + return Modified; + if (LimitGenXDeadVectorRemoval != UINT_MAX) + dbgs() << "-limit-genx-dead-vector-removal " << Count << "\n"; + *U = UndefValue::get((*U)->getType()); + LLVM_DEBUG(dbgs() << "null out old value input in " << *Inst << "\n"); + Modified = true; + } + } + // when no elements in the "new value" input of a wrregion are use, + // then bypass the wrregion with the "old value". + bool bypass = true; + Region R(Inst, BaleInfo()); + if (R.Mask || R.Indirect) + bypass = false; + else { + for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0, + NumRows = R.NumElements / R.Width; Row != NumRows && bypass; + RowIdx += R.VStride, ++Row) { + for (unsigned Idx = RowIdx, Col = 0; Col != R.Width && bypass; + Idx += R.Stride, ++Col) { + if (Idx < LB.getNumElements() && LB.get(Idx)) + bypass = false; + } + } + } + if (bypass) { + Inst->replaceAllUsesWith(Inst->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + Modified = true; + } + } + } + } + return Modified; +} + +/*********************************************************************** + * processInst : process an instruction in the dead element removal pass + */ +void GenXDeadVectorRemoval::processInst(Instruction *Inst) +{ + LLVM_DEBUG(dbgs() << " " << *Inst << "\n has bits " << getLiveBits(Inst) << "\n"); + if (isRootInst(Inst)) { + // This is a "root" instruction. Mark its inputs as wholly live. + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) + markWhollyLive(Inst->getOperand(oi)); + return; + } + // Check for the result of the instruction not being used at all. + auto LB = getLiveBits(Inst); + if (!LB.getNumElements()) + return; + // Handle phi node. + if (auto Phi = dyn_cast(Inst)) { + processElementwise(Phi, LB); + return; + } + // Special case for bitcast. + if (auto BC = dyn_cast(Inst)) { + processBitCast(BC, LB); + return; + } + // Check for element-wise instructions. + if (isa(Inst) || isa(Inst) + || isa(Inst) || isa(Inst)) { + processElementwise(Inst, LB); + return; + } + // Check for rdregion and wrregion. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdpredregion: + processRdRegion(Inst, LB); + return; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrconstregion: + case GenXIntrinsic::genx_wrpredregion: + processWrRegion(Inst, LB); + return; + default: + break; + } + // For any other instruction, just mark all operands as wholly live. 
+  for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi)
+    markWhollyLive(Inst->getOperand(oi));
+}
+
+/***********************************************************************
+ * processRdRegion : process a rdregion instruction for element liveness
+ */
+void GenXDeadVectorRemoval::processRdRegion(Instruction *Inst, LiveBits LB)
+{
+  auto InInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum));
+  Region R(Inst, BaleInfo());
+  if (R.Indirect) {
+    markWhollyLive(InInst);
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum));
+    return;
+  }
+  if (!InInst)
+    return;
+  // Set bits in InLB (InInst's livebits) for live elements read by the
+  // rdregion.
+  bool Modified = false;
+  LiveBits InLB = getLiveBits(InInst, /*Create=*/true);
+  for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0,
+      NumRows = R.NumElements / R.Width; Row != NumRows;
+      RowIdx += R.VStride, ++Row)
+    for (unsigned Idx = RowIdx, Col = 0; Col != R.Width; Idx += R.Stride, ++Col)
+      if (LB.get(Row * R.Width + Col))
+        if (Idx < InLB.getNumElements())
+          Modified |= InLB.set(Idx);
+  if (Modified)
+    addToWorkList(InInst);
+}
+
+/***********************************************************************
+ * processWrRegion : process a wrregion instruction for element liveness
+ */
+void GenXDeadVectorRemoval::processWrRegion(Instruction *Inst, LiveBits LB)
+{
+  Region R(Inst, BaleInfo());
+  if (R.Mask)
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum));
+  auto NewInInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum));
+  if (R.Indirect) {
+    markWhollyLive(NewInInst);
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum));
+  } else if (NewInInst) {
+    // Set bits in NewInLB (NewInInst's livebits) for live elements read by
+    // the wrregion in the "new value" input.
+    bool Modified = false;
+    LiveBits NewInLB = getLiveBits(NewInInst, /*Create=*/true);
+    for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0,
+        NumRows = R.NumElements / R.Width; Row != NumRows;
+        RowIdx += R.VStride, ++Row)
+      for (unsigned Idx = RowIdx, Col = 0; Col != R.Width;
+          Idx += R.Stride, ++Col)
+        if (Idx < LB.getNumElements() && LB.get(Idx))
+          Modified |= NewInLB.set(Row * R.Width + Col);
+    if (Modified)
+      addToWorkList(NewInInst);
+  }
+  // For the "old value" input, we want to see if any elements are used even if
+  // the input is a constant, since we want to be able to turn it into undef
+  // later on if it is not used. In the non-instruction case, OldInLB is left
+  // in a state where it contains no bits and OldInLB.getNumElements() is 0.
+  LiveBits OldInLB;
+  auto OldInInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum));
+  if (OldInInst)
+    OldInLB = getLiveBits(OldInInst, /*Create=*/true);
+  bool Modified = false;
+  bool UsedOldInput = false;
+  if (R.Indirect) {
+    if (OldInLB.getNumElements())
+      Modified = OldInLB.orBits(LB);
+    UsedOldInput = true;
+  } else {
+    // Set bits in OldInLB (OldInInst's livebits) for live elements read by the
+    // wrregion in the "old value" input, excluding ones that come from the
+    // "new value" input.
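+    // For example (illustrative): a non-predicated wrregion writing a
+    // width 2, stride 1 region starting at element 2 of an <8 x i32> value
+    // takes elements 2 and 3 of its result from the "new value" input and
+    // the rest from the "old value" input, so of the result's live bits only
+    // 0, 1 and 4..7 propagate to the "old value" input here.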
+ unsigned NextRow = 0, NextCol = 0, NextIdx = R.Offset / R.ElementBytes, + NextRowIdx = NextIdx, NumRows = R.NumElements / R.Width; + for (unsigned Idx = 0, End = LB.getNumElements(); Idx != End; ++Idx) { + if (Idx == NextIdx) { + // This element comes from the "new value" input, unless the wrregion + // is predicated in which case it could come from either. + if (R.Mask && LB.get(Idx)) { + UsedOldInput = true; + if (OldInLB.getNumElements()) + Modified |= OldInLB.set(Idx); + } + if (++NextCol == R.Width) { + if (++NextRow == NumRows) + NextIdx = End; + else + NextIdx = NextRowIdx += R.VStride; + NextCol = 0; + } else + NextIdx += R.Stride; + } else { + // This element comes from the "old value" input. + if (LB.get(Idx)) { + UsedOldInput = true; + if (OldInLB.getNumElements()) + Modified |= OldInLB.set(Idx); + } + } + } + } + if (Modified) + addToWorkList(OldInInst); + if (UsedOldInput) { + // We know that at least one element of the "old value" input is used, + // so add the wrregion to the used old input set. + WrRegionsWithUsedOldInput.insert(Inst); + } +} + +/*********************************************************************** + * processBitCast : process a bitcast instruction for element liveness + */ +void GenXDeadVectorRemoval::processBitCast(Instruction *Inst, LiveBits LB) +{ + auto InInst = dyn_cast(Inst->getOperand(0)); + if (!InInst) + return; + LiveBits InLB = getLiveBits(InInst, /*Create=*/true); + bool Modified = false; + if (InLB.getNumElements() == LB.getNumElements()) + Modified = InLB.orBits(LB); + else if (InLB.getNumElements() > LB.getNumElements()) { + assert((InLB.getNumElements() % LB.getNumElements()) == 0); + int Scale = InLB.getNumElements() / LB.getNumElements(); + // Input element is smaller than result element. + for (unsigned Idx = 0, End = LB.getNumElements(); Idx != End; ++Idx) + if (LB.get(Idx)) + Modified |= InLB.setRange(Idx * Scale, Scale); + } else { + assert((LB.getNumElements() % InLB.getNumElements()) == 0); + int Scale = LB.getNumElements() / InLB.getNumElements(); + // Input element is bigger than result element. 
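+    // e.g. (illustrative) for %r = bitcast <2 x i64> %v to <8 x i16>, Scale
+    // is 4 and element 1 of %v is marked live if any of elements 4..7 of %r
+    // is live.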
+ for (unsigned Idx = 0, End = InLB.getNumElements(); Idx != End; ++Idx) { + bool IsSet = false; + for (unsigned Idx2 = 0; Idx2 != Scale; ++Idx2) + IsSet |= LB.get(Idx*Scale | Idx2); + if (IsSet) + Modified |= InLB.set(Idx); + } + } + if (Modified) + addToWorkList(InInst); +} + +/*********************************************************************** + * processElementwise : process an element-wise instruction such as add or + * a phi node + */ +void GenXDeadVectorRemoval::processElementwise(Instruction *Inst, LiveBits LB) +{ + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + auto OpndInst = dyn_cast(Inst->getOperand(oi)); + if (!OpndInst) + continue; + auto OpndLB = getLiveBits(OpndInst, /*Create=*/true); + if (isa(Inst) && oi == 0 && + !OpndInst->getType()->isVectorTy()) { + // First operand of select inst can be scalar, ignore it + markWhollyLive(OpndInst); + continue; + } + + if (OpndLB.orBits(LB)) + addToWorkList(OpndInst); + } +} + +/*********************************************************************** + * markWhollyLive : mark a value as wholly live (all elements live) + */ +void GenXDeadVectorRemoval::markWhollyLive(Value *V) +{ + auto Inst = dyn_cast_or_null(V); + if (!Inst) + return; + auto LB = getLiveBits(Inst, /*Create=*/true); + if (LB.setRange(0, LB.getNumElements())) + addToWorkList(Inst); +} + +/*********************************************************************** + * addToWorkList : add instruction to work list if not already there + * + * Enter: Inst = the instruction + * + * This does not actually add to the work list in the initial scan through + * the whole code. + */ +void GenXDeadVectorRemoval::addToWorkList(Instruction *Inst) +{ + LLVM_DEBUG(dbgs() << " " << Inst->getName() << " now " << getLiveBits(Inst) << "\n"); + if (WorkListSet.insert(Inst).second && WorkListPhase) { + LLVM_DEBUG(dbgs() << " adding " << Inst->getName() << " to work list\n"); + WorkList.push(Inst); + } +} + +/*********************************************************************** + * getLiveBits : get the bitmap of live elements for the given instruction + * + * Return: LiveBits object, which contains a pointer to the bitmap for + * this instruction, and a size which is set to 0 if there is no + * bitmap allocated yet for this instruction and Create is false + */ +LiveBits GenXDeadVectorRemoval::getLiveBits(Instruction *Inst, bool Create) +{ + unsigned NumElements = 1; + if (auto VT = dyn_cast(Inst->getType())) + NumElements = VT->getNumElements(); + LiveBitsStorage *LBS = nullptr; + if (!Create) { + auto i = InstMap.find(Inst); + if (i == InstMap.end()) + return LiveBits(); + LBS = &i->second; + } else { + auto Ret = InstMap.insert(std::map::value_type(Inst, LiveBitsStorage())); + LBS = &Ret.first->second; + if (Ret.second) { + // New entry. Set its number of elements. 
+ LBS->setNumElements(NumElements); + } + } + return LiveBits(LBS, NumElements); +} + +/*********************************************************************** + * LiveBits::isAllZero : return true if all bits zero + */ +bool LiveBits::isAllZero() const +{ + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) + if (P[Idx]) + return false; + return true; +} + +/*********************************************************************** + * LiveBits::set : set (or clear) bit + * + * Enter: Idx = element number + * Val = true to set, false to clear, default true + * + * Return: true if the bitmap changed + */ +bool LiveBits::set(unsigned Idx, bool Val) +{ + assert(Idx < NumElements); + uintptr_t *Ptr = P + Idx / BitsPerWord; + uintptr_t Bit = 1ULL << (Idx % BitsPerWord); + uintptr_t Entry = *Ptr; + if (Val) + Entry |= Bit; + else + Entry &= ~Bit; + bool Ret = Entry != *Ptr; + *Ptr = Entry; + return Ret; +} + +/*********************************************************************** + * LiveBits::copy : copy all bits from another LiveBits + */ +bool LiveBits::copy(LiveBits Src) +{ + assert(NumElements == Src.NumElements); + bool Modified = false; + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) { + Modified |= P[Idx] != Src.P[Idx]; + P[Idx] = Src.P[Idx]; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::orBits : or all bits from another LiveBits into this one + */ +bool LiveBits::orBits(LiveBits Src) +{ + assert(NumElements == Src.NumElements); + bool Modified = false; + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) { + uintptr_t Word = P[Idx] | Src.P[Idx]; + Modified |= P[Idx] != Word; + P[Idx] = Word; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::setRange : set range of bits, returning true if any changed + */ +bool LiveBits::setRange(unsigned Start, unsigned Len) +{ + bool Modified = false; + unsigned End = Start + Len; + assert(End <= NumElements); + while (Start != End) { + unsigned ThisLen = BitsPerWord - (Start & (BitsPerWord - 1)); + if (ThisLen > End - Start) + ThisLen = End - Start; + uintptr_t *Entry = P + (Start / BitsPerWord); + uintptr_t Updated = *Entry + | ((uintptr_t)-1LL >> (BitsPerWord - ThisLen)) + << (Start & (BitsPerWord - 1)); + if (Updated != *Entry) { + Modified = true; + *Entry = Updated; + } + Start += ThisLen; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::print : debug print + */ +void LiveBits::print(raw_ostream &OS) const +{ + for (unsigned Idx = 0, End = getNumElements(); Idx != End; ++Idx) + OS << get(Idx); +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp new file mode 100644 index 000000000000..9b9a24d89690 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp @@ -0,0 +1,1662 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies 
of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXDepressurizer +/// ----------------- +/// +/// GenXDepressurizer is a pass that identifies where register pressure is +/// excessive, and attempts to sink and/or clone definitions past that area to +/// reduce register pressure. +/// +/// Currently the pass is enabled to handle only flag (predicate) values. It is +/// supposed to work for general values, but that is not yet enabled and it may +/// require some bug fixing and fine tuning before it is. +/// +/// In fact this pass is now viewed as a dead end. The plan to replace it is a +/// pass that does register allocation as if into Gen's real registers, doing +/// live range splitting and rematerialization where required, to help undo the +/// register-pressure-increasing effects of CSE and LICM where it would cause a +/// spill. +/// +/// The basic idea of the existing GenXDepressurizer pass: +/// +/// 1. Scan the code backwards, keeping track of what values are live and what +/// the register pressure is (total size of all live values, also the total +/// size for flag (predicate) values). +/// +/// 2. Where register pressure becomes excessive, look at currently live values +/// to see if any is a definition that could profitably be sunk to below the +/// current point. +/// +/// 3. Sink any such instructions until register pressure is no longer +/// excessive. +/// +/// 4. For a flag value, "profitably be sunk" includes the case that it +/// decreases flag register pressure but increases overall register pressure +/// (by, for instance, lengthening the live ranges of the inputs to a cmp), +/// but general register pressure is not high at the current point. +/// +/// 5. A flag value that does not require cloning (all uses are dominated by the +/// current point) is sunk anyway, as long as it does not push an already +/// high general pressure up higher. +/// +/// Point 5 means that this pass replaces GenXCodeSinking, which sank any single +/// use flag value. +/// +/// There are some complications to the scheme: +/// +/// * How do we scan code backwards in a way that keeps track of pressure when +/// there is control flow, particularly loops? +/// +/// * When considering a definition to sink, we need to know whether a +/// particular use is reachable from the current point, and whether it is +/// dominated by it. +/// +/// Backwards scanning order and pseudo CFG +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// In order to keep track of liveness and pressure as we scan backwards, we +/// want to scan the basic blocks in an order that ensures that we do not scan +/// a particular basic block until we have scanned all its successors. 
In that +/// way we can easily gather the live out set of the basic block from the live +/// in of each successor, modified by the incoming for our block in the phi +/// nodes in the successor. (If there are phi nodes, there is only one +/// successor, because critical edges have been split.) +/// +/// A loop needs special consideration. We want to scan all of the blocks of a +/// loop (including inner loops) in one go, after scanning all possible +/// successors of the loop, and before scanning the predecessor(s) of the loop +/// header. Within the loop, we want to start at the backedge predecessor(s), +/// but we need to set up the liveness at the end of a backedge predecessor to +/// take account of +/// +/// a. any value that is live in to the loop and live out of the loop at some +/// loop exit, and +/// +/// b. any value that is defined in the loop and is live round the backedge. +/// +/// Superbales +/// ^^^^^^^^^^ +/// +/// Sinking is performed in units of a superbale. +/// +/// For a general value, a superbale is the bale that defines the value, and, +/// if that is a wrregion, the rest of the chain of wrregion bales that write +/// to other parts of that value and have the same inputs as the defining bale. +/// We consider such a superbale as a whole because considering and sinking +/// just the bale would not show any benefit, because it has an input to the +/// wrregion the same size as the result. Such a chain of wrregions typically +/// arises from legalization where vector decomposing has not subsequently been +/// able to split the big vector up. +/// +/// For a flag value, a superbale is a tree where each non-leaf node is an +/// and/or/xor/not instruction acting on predicates. Again this is done because +/// sinking just an and/or/xor/not instruction would not show any benefit to +/// flag pressure. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_DEPRESSURIZER" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt + LimitGenXDepressurizer("limit-genx-depressurizer", cl::init(UINT_MAX), + cl::Hidden, cl::desc("Limit GenX depressurizer.")); + +STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumCloned, "Number of instructions cloned"); + +namespace { + +// PseudoCFG : the pseudo control flow graph for a function +// +// The pseudo CFG is a graph of the basic blocks in a function, similar to the +// real CFG, but with the following differences: +// * It is acyclic +// * Therefore there are no loop backedges +// * What was a loop backedge in the real CFG is replaced by special "loop +// exit" edges from what was the loop backedge predecessor of the loop +// header to each loop exit block, also pointing to the loop header. +// * This only works if the real CFG is reducible. Any unnatural loops in the +// real CFG are probably not properly represented in the pseudoCFG. 
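+//
+// As an illustrative example, for a simple natural loop
+//     entry -> head; head -> body; head -> exit; body -> head
+// the pseudo CFG drops the backedge body -> head and instead gives body a
+// "loop exit" edge to exit that also records head as the loop header, so the
+// graph becomes the acyclic entry -> head -> body -> exit (with exit still a
+// successor of head as well).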
+// +// The pseudo CFG also provides an ordering of blocks such that a block is not +// visited until all its predecessors have been. Because of the above changes +// in the pseudo CFG, this also has the property that, once we get to a +// (natural) loop header, all blocks in the loop are processed before anything +// else. +// +// The pseudo CFG provides a way to propagate liveness backwards through the +// function: +// * Visit blocks in the reverse of the pseudo CFG ordering, such that no +// block is visited until all its successors have been, and no part of a +// loop is visited until all of the loop exits have been. +// * For a block: +// 1. initialize the live out with (real CFG) successors' corresponding phi +// incomings; +// 2. for a normal edge, propagate the successor's live in to this block's +// live out; +// 3. for a "loop exit" edge, propagate the successor's live in to this +// block's live out, but only for values that are defined before the loop +// header, i.e. in a block that would be visited by the loop header in +// this visit order. +// This provides the correct liveness for any particular point within a loop +// for these cases: +// a. a value that is used after this point in the loop (from 2); +// b. a value that is live round any backedge reachable from this point +// (from 1); +// c. a value that is defined in the loop and used after the loop via a +// loop exit reachable from this point (from 2); +// d. a value that is defined before the loop and used after the loop, and +// is thus live through the whole loop (from 3). +// +// If the real CFG is irreducible, then this liveness information will be +// inaccurate. +// +class PseudoCFG { +public: + struct Node { + friend PseudoCFG; + SmallVector Preds; + SmallVector Succs; + BasicBlock *LoopHeader; + Node() : LoopHeader(nullptr) {} + + public: + void removeSucc(BasicBlock *Succ); + void removePred(BasicBlock *Pred); + // getLoopHeader : normally returns 0. If this is a backedge node, + // returns the corresponding loop header block + BasicBlock *getLoopHeader() { return LoopHeader; } + // pred and succ iterators + typedef SmallVectorImpl::iterator pred_iterator; + pred_iterator pred_begin() { return Preds.begin(); } + pred_iterator pred_end() { return Preds.end(); } + typedef SmallVectorImpl::iterator succ_iterator; + succ_iterator succ_begin() { return Succs.begin(); } + succ_iterator succ_end() { return Succs.end(); } + }; + +private: + std::vector Ordering; + std::map Nodes; + +public: + void clear() { + Ordering.clear(); + Nodes.clear(); + } + // compute : compute the pseudo CFG for the function. + // It is assumed that critical edges have been split. + void compute(Function *F, DominatorTree *DT, + LoopInfoBase *LI); + // getNode : get pseudo CFG node for basic block + Node *getNode(BasicBlock *BB) { return &Nodes[BB]; } + // iterators through the ordering + typedef std::vector::iterator iterator; + iterator begin() { return Ordering.begin(); } + iterator end() { return Ordering.end(); } + typedef std::vector::reverse_iterator reverse_iterator; + reverse_iterator rbegin() { return Ordering.rbegin(); } + reverse_iterator rend() { return Ordering.rend(); } + // Debug dump/print + void dump() { print(dbgs()); } + void print(raw_ostream &OS); +}; + +// Liveness : the liveness information at some point in the program +// This class is local to this source file and completely unrelated to +// GenXLiveness. 
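+// (For instance, an i1 vector mask counts towards FLAG pressure, the result
+// of a genx.convert.addr or genx.add.addr towards ADDR pressure, and
+// everything else towards GENERAL pressure, as isFlag and isAddr below
+// decide.)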
+class Liveness { +public: + enum Category { GENERAL, FLAG, ADDR, NUMCATS }; + +private: + std::set Values[NUMCATS]; + unsigned Pressure; // overall register pressure + unsigned Pressures[NUMCATS]; // pressure for each individual category +public: + Liveness() : Pressure(0) { + for (unsigned Cat = 0; Cat != NUMCATS; ++Cat) + Pressures[Cat] = 0; + } + static bool isFlag(Value *V) { + return V->getType()->getScalarType()->isIntegerTy(1); + } + static bool isAddr(Value *V) { + if (!V->getType()->getScalarType()->isIntegerTy(16)) + return false; + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_convert_addr: + case GenXIntrinsic::genx_add_addr: + return true; + default: + break; + } + return false; + } + static unsigned getValueSize(Value *V); + void copyFrom(Liveness *Other); + void addValue(Value *V); + bool removeValue(Value *V); + void copyValues(Liveness *Other); + unsigned getPressure(unsigned Cat) { return Pressures[Cat]; } + unsigned getPressure() { return Pressure; } + bool contains(Value *V) { + auto ValueSet = &Values[GENERAL]; + if (isFlag(V)) + ValueSet = &Values[FLAG]; + else if (isAddr(V)) + ValueSet = &Values[ADDR]; + return ValueSet->find(V) != ValueSet->end(); + } + // Iterator (over set of values) + typedef std::set::iterator iterator; + iterator begin(unsigned Cat) { return Values[Cat].begin(); } + iterator end(unsigned Cat) { return Values[Cat].end(); } + unsigned cat_begin() { return 0; } + unsigned cat_end() { return NUMCATS; } + // Debug print and dump + void print(raw_ostream &OS); + void dump() { print(dbgs()); dbgs() << '\n'; } +}; + +// Superbale : a sequence of bales where each is headed by a wrregion whose +// "old value of vector" input is the previous bale, and the other operands of +// the bales are all the same. +struct Superbale { + // Instruction number of head instruction of superbale + unsigned Number; + // Bale head instructions, stored in reverse of code order + SmallVector Bales; + // Operands (some entries can be nullptr) + SmallVector Operands; + Instruction *getHead() { return Bales[0]; } + void print(raw_ostream &OS); + void dump() { print(dbgs()); dbgs() << '\n'; } +}; + +// SinkCandidate : a candidate superbale for sinking +struct SinkCandidate { + Superbale *SB; + int Benefit; + bool AllUsesDominatedByHere; + SinkCandidate(Superbale *SB, int Benefit, bool AUDBH) + : SB(SB), Benefit(Benefit), AllUsesDominatedByHere(AUDBH) {} + // Sort by whether all uses are dominated by here, then by best benefit, then + // by latest definition point. + bool operator<(const SinkCandidate &Rhs) const { + if (AllUsesDominatedByHere != Rhs.AllUsesDominatedByHere) + return AllUsesDominatedByHere > Rhs.AllUsesDominatedByHere; + if (Benefit != Rhs.Benefit) + return Benefit > Rhs.Benefit; + if (SB == nullptr) + return false; + if (Rhs.SB == nullptr) + return true; + return SB->Number > Rhs.SB->Number; + } +}; + +// GenX depressurizer pass +class GenXDepressurizer : public FunctionGroupPass { + enum { FlagThreshold = 6, AddrThreshold = 32, GRFThreshold = 2560, + FlagGRFTolerance = 3840 }; + bool Modified; + GenXGroupBaling *Baling; + DominatorTree *DT; + LoopInfoBase *LI; + PseudoCFG *PCFG; + unsigned MaxPressure; + std::map SubroutinePressures; + std::map LiveIn; + std::map LiveOut; + Liveness *Live; + // A numbering of instructions. 
Because of the way the basic block ordering + // is constructed, if instruction I2 is reachable from instruction I1, then + // InstNumbers[I1] < InstNumbers[I2], unless the reachability is via a + // loop backedge. The converse is not necessarily true. + std::map InstNumbers; + +public: + static char ID; + explicit GenXDepressurizer() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { + return "GenX register pressure reducer"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const override { + return createGenXGroupPrinterPass(O, Banner); + } + +private: + void processFunction(Function *F); + void orderAndNumber(Function *F); + void processBasicBlock(BasicBlock *BB); + void getLiveOut(BasicBlock *BB, Liveness *Live); + void processInstruction(Instruction *Inst); + void attemptSinking(Instruction *InsertBefore, std::set *Exclude, + Liveness::Category Cat, bool AllowClone); + bool sink(Instruction *InsertBefore, Superbale *SB, bool AllowClone = false); + BasicBlock *sinkOnce(Instruction *InsertBefore, Superbale *SB, + ArrayRef Uses); + bool modifyLiveness(Liveness *Live, Superbale *SB); + int getSuperbaleKillSize(Superbale *SB); + int getSinkBenefit(Superbale *SB, Liveness::Category Cat, unsigned Headroom); + bool fillSuperbale(Superbale *SB, Instruction *Inst, bool IsFlag); + void MergeCandidate(SinkCandidate &Lhs, SinkCandidate &Rhs); +}; + +} // end anonymous namespace + +char GenXDepressurizer::ID = 0; +namespace llvm { +void initializeGenXDepressurizerPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXDepressurizer, "GenXDepressurizer", "GenXDepressurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_END(GenXDepressurizer, "GenXDepressurizer", "GenXDepressurizer", false, false) + +FunctionGroupPass *llvm::createGenXDepressurizerPass() +{ + initializeGenXDepressurizerPass(*PassRegistry::getPassRegistry()); + return new GenXDepressurizer(); +} + +void GenXDepressurizer::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the register pressure reduction pass for + * this FunctionGroup + */ +bool GenXDepressurizer::runOnFunctionGroup(FunctionGroup &FG) { + if (skipOptWithLargeBlock(FG)) + return false; + + Modified = false; + Baling = &getAnalysis(); + // Process functions in the function group in reverse order, so we know the + // max pressure in a subroutine when we see a call to it. 
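+  // For example, if kernel K calls subroutine S, S is processed before K, so
+  // that when the backward scan of K reaches the call, the recorded
+  // SubroutinePressures[S] can be added on top of the pressure live across
+  // the call site (see processInstruction).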
+ for (auto fgi = FG.rbegin(), fge = FG.rend(); fgi != fge; ++fgi) { + Function *F = *fgi; + processFunction(F); + SubroutinePressures[F] = MaxPressure; + } + SubroutinePressures.clear(); + return Modified; +} + +/*********************************************************************** + * processFunction : run depressurizer on one function + */ +void GenXDepressurizer::processFunction(Function *F) { + LLVM_DEBUG(dbgs() << "GenXDepressurizer on function " << F->getName() << '\n'); + MaxPressure = 0; + DT = getAnalysis().getDomTree(F); + LI = new LoopInfoBase(); + LI->analyze(*DT); + // Calculate the pseudo CFG. + PCFG = new PseudoCFG(); + PCFG->compute(F, DT, LI); + // Order and number the instructions. + orderAndNumber(F); + // Visit each basic block. + MaxPressure = 0; + for (auto ri = PCFG->rbegin(), re = PCFG->rend(); ri != re; ++ri) { + processBasicBlock(*ri); + } + + delete PCFG; + delete LI; + LLVM_DEBUG(dbgs() << "max pressure " << MaxPressure << " for function " + << F->getName() << '\n'); + SubroutinePressures[F] = MaxPressure; +} + +/*********************************************************************** + * orderAndNumber : order and number the instructions + * + * This has three purposes: + * + * 1. ensure the instructions in a bale are adjacent; + * + * 2. for a boolean and/or, ensure that a tree of bales (where each bale has + * a single use that is its parent in the tree, in the same basic block) is + * adjacent and in depth first order to minimize flag pressure in a tree of + * boolean ops; + * + * 3. number the instructions, with each instruction in a bale given the same + * number. + * + * This scans the code backwards, so numbers backwards starting at a high + * number. + */ +void GenXDepressurizer::orderAndNumber(Function *F) { + unsigned InstNum = 1000000000; + for (auto fi = PCFG->rbegin(), fe = PCFG->rend(); fi != fe; ++fi) { + BasicBlock *BB = *fi; + auto Inst = &BB->back(); + for (;;) { + --InstNum; + if (isa(Inst)) + InstNumbers[Inst] = InstNum; + else { + Bale B; + Baling->buildBale(Inst, &B); + auto InsertBefore = Inst; + // Move the bale instructions to a contiguous lump, and number them. + Instruction *GotoJoin = nullptr; + for (auto ii = B.begin(), ie = B.end(); ii != ie; ++ii) { + Inst = ii->Inst; + InstNumbers[Inst] = InstNum; + if (Inst == InsertBefore) + continue; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + GotoJoin = Inst; + break; + default: + break; + } + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + } + if (GotoJoin) { + // For a goto/join, check that its outside-bale uses are also moved, + // and number the instructions. + // This is the only case of an inside-bale instruction having + // outside-bale uses. + // This is a bit of a bodge, which we'll tolerate for now on the + // basis that this pass will go away once we have a better pass for + // detecting register pressure and alleviating it by moving code and + // rematerializing. + SmallVector Users; + for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); + ui != ue; ++ui) + Users.push_back(cast(ui->getUser())); + Instruction *InsertBefore = GotoJoin->getNextNode(); + for (auto ui = Users.begin(), ue = Users.end(); ui != ue; ++ui) { + Instruction *User = *ui; + if (!isa(User->getType())) { + // Skip the use that is in the bale. 
We are relying on the use in + // the bale being the only extractvalue that is scalar; the other + // two (for goto) or one (for join) are vector (the EM and RM + // values). + continue; + } + if (User->getParent() == GotoJoin->getParent()) { + // Only move the extractvalue if it is in the same basic block. + User->removeFromParent(); + User->insertBefore(InsertBefore); + InstNumbers[User] = InstNum; + } + } + } + Inst = B.getHead()->Inst; + if (Inst->getType()->getScalarType()->isIntegerTy(1) && + (Inst->getOpcode() == Instruction::And || + Inst->getOpcode() == Instruction::Or)) { + // Now look at the operands. Any that is a single use instruction in + // the same basic block is moved. The rest of its bale, and that + // bale's own operands, get moved when it is later processed in the + // loop. + InsertBefore = B.begin()->Inst; + for (auto ii = B.begin(), ie = B.end(); ii != ie; ++ii) { + Inst = ii->Inst; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (ii->Info.isOperandBaled(oi)) + continue; // only consider out-of-bale operands + auto OpndInst = dyn_cast(Inst->getOperand(oi)); + if (!OpndInst) + continue; + if (OpndInst->getParent() != BB) + continue; + if (isa(OpndInst)) + continue; + if (!OpndInst->hasOneUse()) + continue; + OpndInst->removeFromParent(); + OpndInst->insertBefore(InsertBefore); + } + } + } + // On to the previous instruction, which is now the one before the first + // instruction in the current bale. + Inst = B.begin()->Inst; + } + if (Inst == &BB->front()) + break; + Inst = Inst->getPrevNode(); + } + } +} + +/*********************************************************************** + * processBasicBlock : process one basic block + */ +void GenXDepressurizer::processBasicBlock(BasicBlock *BB) { + // Create a new empty entry for this BB in the LiveIn map, and use it for + // keeping track of liveness as we scan backwards through the block. + Live = &LiveIn[BB]; + // Populate Live with the live out values. + getLiveOut(BB, Live); + // Scan backwards through the block, excluding phi nodes. + auto Inst = &BB->back(); + for (;;) { + if (isa(Inst)) + break; + processInstruction(Inst); + if (Inst == &BB->front()) + break; + Inst = Inst->getPrevNode(); + } + // Just before the first (non-phi) instruction, attempt sinking of flag + // values, as long as non-flag pressure is low, and as long as this is not a + // join label. + if (!GotoJoin::isJoinLabel(BB) && FlagGRFTolerance > Live->getPressure()) + attemptSinking(BB->getFirstNonPHI(), nullptr, Liveness::FLAG, + /*AllowClone=*/false); +} + +/*********************************************************************** + * getLiveOut : populate empty Liveness with the live out of the BB + */ +void GenXDepressurizer::getLiveOut(BasicBlock *BB, Liveness *Live) { + // Get each successor's live in values into our liveness. If getLoopHeader + // returns non-0, then we are looking at a loop backedge and we only want + // to get successors' live in values if they are defined before the loop + // header. 
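+  // For example, if this is a backedge block whose loop-exit successor has
+  // both %pre (defined before the loop header) and %in (defined inside the
+  // loop) live in, only %pre is taken from that successor here; values
+  // defined inside the loop reach this block's live out through the phi
+  // incomings of the real-CFG loop header successor, handled below.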
+ unsigned LoopHeaderNum = 0; + auto BBNode = PCFG->getNode(BB); + if (auto LoopHeader = BBNode->getLoopHeader()) + LoopHeaderNum = InstNumbers[&LoopHeader->front()]; + for (auto si = BBNode->succ_begin(), se = BBNode->succ_end(); si != se; + ++si) { + auto LI = &LiveIn[*si]; + for (auto ci = LI->cat_begin(), ce = LI->cat_end(); ci != ce; ++ci) + for (auto vi = LI->begin(ci), ve = LI->end(ci); vi != ve; ++vi) { + Value *V = *vi; + if (auto Inst = dyn_cast(V)) + if (LoopHeaderNum && LoopHeaderNum <= InstNumbers[Inst]) + continue; // Ignore instruction defined in loop from loop exit succ + Live->addValue(V); + } + } + // Now adjust the liveness for the phi nodes of each real CFG successor. This + // includes the case that this is a backedge and the real CFG successor is + // the loop header; this is how we get defs inside the loop into our + // liveness. + auto TI = BB->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + auto Succ = TI->getSuccessor(i); + for (auto ii = Succ->begin();; ++ii) { + auto Phi = dyn_cast(&*ii); + if (!Phi) + break; + Live->removeValue(Phi); + Live->addValue(Phi->getIncomingValue(Phi->getBasicBlockIndex(BB))); + } + } + if (MaxPressure < Live->getPressure()) { + MaxPressure = Live->getPressure(); + LLVM_DEBUG(dbgs() << "max pressure now " << MaxPressure << '\n'); + } + LLVM_DEBUG(dbgs() << "getLiveOut(" << BB->getName() << "): "; Live->print(dbgs()); + dbgs() << '\n'); + // Copy the liveness to the LiveOut entry for this BB. + LiveOut[BB].copyFrom(Live); +} + +/*********************************************************************** + * processInstruction : process one instruction in backwards scan of BB + * + * Return: Prev = previous instruction, i.e. next one to scan + */ +void GenXDepressurizer::processInstruction(Instruction *Inst) { + if (!Inst) + return; + if (Baling->isBaled(Inst)) + return; // Not head of bale, ignore + if (isa(Inst)) + return; // Too confusing to consider sinking when we get to an extractvalue + // out of a goto/join, so ignore. + Bale B; + Baling->buildBale(Inst, &B); + LLVM_DEBUG(dbgs() << '[' << InstNumbers[Inst] << ']'; + if (!Inst->getDebugLoc()) + dbgs() << " {line " << Inst->getDebugLoc().getLine() << '}'; + B.print(dbgs())); + unsigned OldFlagPressure = Live->getPressure(Liveness::FLAG); + // Remove the result of the bale from liveness. + Live->removeValue(Inst); + // If this is a non-intrisic call, add the max pressure from inside the call. + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + LLVM_DEBUG(dbgs() << "pressure inside subroutine: " + << SubroutinePressures[CI->getCalledFunction()] << '\n'); + unsigned AddedPressure = + Live->getPressure() + SubroutinePressures[CI->getCalledFunction()]; + if (MaxPressure < AddedPressure) { + MaxPressure = AddedPressure; + LLVM_DEBUG(dbgs() << "max pressure now " << MaxPressure << '\n'); + } + } + } + // Add operands from outside the bale to liveness. Also keep them in a + // separate set for the use of attemptSinking. 
+ std::set BaleOperands; + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned ii = 0, ie = BI->Inst->getNumOperands(); ii != ie; ++ii) { + if (!BI->Info.isOperandBaled(ii)) { + Value *Opnd = BI->Inst->getOperand(ii); + if (isa(Opnd) || isa(Opnd)) { + Live->addValue(Opnd); + BaleOperands.insert(Opnd); + } + } + } + } + LLVM_DEBUG(Live->print(dbgs()); dbgs() << '\n'); + if (Inst && Inst->isTerminator()) + return; // Do not attempt to sink past last instruction in block. + + // FIXME: This does not deal with a subroutine call instruction, where + // pressure goes up during the call and then comes back down again on + // return. I think the last remaining flag spill in HEVCEnc_PB is because + // of this; a CSEd flag is live over a subroutine call but we do not notice + // that increased flag pressure inside the call should force the flag def + // to be cloned and sunk. + + // Attempt sinking of flag values if necessary. Do not do that if non-flag + // pressure is already high. If flag pressure has just gone high, sink any + // flag value (with a benefit). Otherwise, only sink single use flag values. + if (FlagGRFTolerance > Live->getPressure()) { + bool AllowClone = OldFlagPressure <= FlagThreshold && + Live->getPressure(Liveness::FLAG) > FlagThreshold; + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::FLAG, + AllowClone); + } + + // Attemp sinking of address values if necessary. + if (Live->getPressure(Liveness::ADDR) > AddrThreshold) + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::ADDR, + /*AllowClone=*/false); + + // Attempt sinking of non-flag value(s) if necessary. + if (Live->getPressure() > GRFThreshold) + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::GENERAL, + /*AllowClone=*/false); + + if (MaxPressure < Live->getPressure()) { + MaxPressure = Live->getPressure(); + LLVM_DEBUG(dbgs() << "max pressure up to " << MaxPressure << '\n'); + } +} + +/*********************************************************************** + * attemptSinking : attempt some sinking to reduce pressure + * + * Enter: InsertBefore = instruction to insert sunk instruction before + * Exclude = 0 else exclude any sink candidate in this set (used to + * exclude superbales used in the present bale) + * FlagSinking = true to sink flags + * AllowClone = true to sink anything suitable, false to only sink + * when cloning is not required, used to sink flag defs + * even when flag pressure is low. + * + * This is called in three different ways: + * + * FlagSinking, !AllowClone: sink any flag def whose uses are all dominated by + * the current position (quit if normal pressure gets too high) + * + * FlagSinking, AllowClone: sink any flag def, preferring ones that do not need + * a clone, but switch to !AllowClone mode once flag pressure is low enough + * (and quit if normal pressure gets too high) + * + * !FlagSinking, AllowClone: sink normal (non-flag) def if it provides a + * benefit to pressure, until pressure is low enough. + */ +void GenXDepressurizer::attemptSinking(Instruction *InsertBefore, + std::set *Exclude, + Liveness::Category Cat, + bool AllowClone) { + LLVM_DEBUG(dbgs() << "attemptSinking(Cat=" << (Cat == Liveness::FLAG ? "flag" : + Cat == Liveness::ADDR ? "addr" : + "general") + << ", AllowClone=" << AllowClone << ")\n"); + if (!InsertBefore) + return; + // Build two-addr operand -> instruction map for checking against two-addr + // instructions. 
+ std::map TwoAddrValueMap; + BasicBlock *BB = InsertBefore->getParent(); + if (InsertBefore != &BB->front()) { + for (auto I = InsertBefore->getPrevNode(); I != &BB->front(); + I = I->getPrevNode()) { + auto CI = dyn_cast(I); + if (!CI) + continue; + int OpndNum = getTwoAddressOperandNum(CI); + if (OpndNum < 0) + continue; + TwoAddrValueMap[I->getOperand(OpndNum)] = CI; + } + } + // Gather the currently live superbales with a sink benefit. + // Exclude any that is used in the present bale. + SmallVector Candidates; + SmallVector SecondRound; + std::map Superbales; + unsigned CurNumber = InstNumbers[InsertBefore]; + int Headroom = 0; + switch (Cat) { + case Liveness::FLAG: + Headroom = FlagGRFTolerance - Live->getPressure(); + break; + default: + break; + } + for (auto i = Live->begin(Cat), e = Live->end(Cat); i != e; ++i) { + if (Exclude && Exclude->find(*i) != Exclude->end()) + continue; + auto Inst = dyn_cast(*i); + if (!Inst) + continue; // only instructions can sink, not args + if (isa(Inst)) + continue; // cannot sink phi node + if (isa(Inst)) + continue; // Don't sink extractvalue from a goto/join. + if (Inst->mayHaveSideEffects() || Inst->mayReadOrWriteMemory()) + continue; + // For this candidate, determine where its uses are, one of these cases: + // + // 1. All uses are dominated by here. This is the preferred case as the + // instruction can simply be sunk, with no cloning. + // 2. Not all uses are reachable from here, but all uses that are reachable + // from here are dominated by here. This can be handled by a clone of + // the instruction, where the cloned instruction takes on the uses that + // are reachable from here. + // 3. Other cases. We do not handle that, although we could enhance it in + // the future to handle this case by finding multiple sites to clone + // the instruction to. + // + // We are using "has a higher instruction number than" as a proxy for "is + // reachable from", which in fact could include some uses that are not + // reachable. + bool AllUsesDominatedByHere = true; + bool AllReachableUsesDominatedByHere = true; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + Instruction *user = cast(ui->getUser()); + if (InstNumbers[user] < CurNumber) { + // Unreachable use. + AllUsesDominatedByHere = false; + continue; + } + if (InsertBefore->getParent() != user->getParent() && + !DT->dominates(InsertBefore->getParent(), user->getParent())) { + AllReachableUsesDominatedByHere = false; + break; + } + } + if (!AllReachableUsesDominatedByHere) + continue; // exclude case 3 + if (!AllowClone && !AllUsesDominatedByHere) + continue; // exclude case 2 if !AllowClone + bool IsFlag = Liveness::isFlag(Inst); + bool IsAddr = Liveness::isAddr(Inst); + if (!IsFlag && !IsAddr && + Inst->getType()->getPrimitiveSizeInBits() < 32 * 8) { + // don't bother with anything smaller than a GRF unless it is a flag + continue; + } + Superbale *SB = &Superbales[Inst]; + assert(SB->Bales.empty()); + if (!fillSuperbale(SB, Inst, IsFlag)) + continue; + // Check whether the sink of this SB will cross its operands' two-addr + // instructions, i.e. + // + // ... := use(v0); // SB.Head + // + // v1 := twoaddr(v0); // two-addr intruction. + // + // x <--- here this SB could be sunk to. + // + // In such case, sinking this SB should be avoided as it creates + // overlapping between v0 and v1; otherwise, additional copy of v0 has to + // be inserted. That won't alleviate the register pressure. 
+ bool CrossTwoAddr = false; + for (auto OI = SB->Operands.begin(), + OE = SB->Operands.end(); OI != OE; ++OI) { + Value *Opnd = *OI; + if (!TwoAddrValueMap.count(Opnd)) + continue; + unsigned TwoAddrNum = InstNumbers[TwoAddrValueMap[Opnd]]; + unsigned SBNum = InstNumbers[SB->getHead()]; + // Ignore the case where the SB itself is a two-addr instruction or part + // of chain of two-addr instructions. + if (TwoAddrNum <= SBNum) + continue; + // Skip sinking/cloning if the current sinking point is beyond where the + // two-addr instruction overwriting the same register. + if (CurNumber > TwoAddrNum) { + LLVM_DEBUG(dbgs() << "could not sink/clone as it will cross the two-addr " + << "instruction sharing the same operand!"); + CrossTwoAddr = true; + break; + } + } + if (CrossTwoAddr) + continue; + // Add the candidate. + int Benefit = getSinkBenefit(SB, Cat, Headroom); + LLVM_DEBUG(dbgs() << "candidate " << SB->getHead()->getName() + << " with benefit " << Benefit + << " and AllUsesDominatedByHere " << AllUsesDominatedByHere + << '\n'); + if (Benefit > 0) + Candidates.push_back(SinkCandidate(SB, Benefit, AllUsesDominatedByHere)); + else if (AllUsesDominatedByHere) + SecondRound.push_back(SinkCandidate(SB, Benefit, true)); + } + if (!Candidates.empty()) { + // Sort the candidates. + std::sort(Candidates.begin(), Candidates.end()); + // Try each candidate. + for (auto i = Candidates.begin(), e = Candidates.end(); i != e; ++i) { + if (!AllowClone && !i->AllUsesDominatedByHere) + continue; // Ignore candidate that needs cloning if AllowClone has + // switched to false (i.e. flag pressure is low) + if (sink(InsertBefore, i->SB)) { + switch (Cat) { + case Liveness::FLAG: + if (Live->getPressure(Liveness::FLAG) <= FlagThreshold) { + // Flag pressure is now low so we can stop sinking when a clone + // is needed. + AllowClone = false; + } + Headroom = FlagGRFTolerance - Live->getPressure(); + if (Headroom <= 0) + return; + break; + case Liveness::ADDR: + if (Live->getPressure(Liveness::ADDR) < AddrThreshold) + return; + break; + default: + if (Live->getPressure() < GRFThreshold) + return; + break; + } + } else if (i->AllUsesDominatedByHere) { + SecondRound.push_back(*i); + } + } + } + if (AllowClone) { + LLVM_DEBUG(dbgs() << "could not do enough sinking to alleviate pressure\n"); + if (Cat == Liveness::FLAG) { + for (auto i = Candidates.begin(), e = Candidates.end(); i != e; ++i) { + (void)sink(InsertBefore, i->SB, true); + } + } + } else { + // Try to sink a group of candidates to reduce register pressure. + // Do NOT Allow Clone for now. + for (auto i = SecondRound.begin(), ie = SecondRound.end(); i != ie; ++i) { + if (i->SB == nullptr) + continue; + auto SB = i->SB; + SmallSet OperandSet; + for (auto k = SB->Operands.begin(), ke = SB->Operands.end(); k != ke; ++k) + OperandSet.insert(*k); + // find a group that shares the same input + auto j = i; + for (++j; j != ie; ++j) { + if (j->SB == nullptr) + continue; + auto SB2 = j->SB; + bool EqualInputs = (SB2->Operands.size() == SB->Operands.size()); + for (auto k = SB2->Operands.begin(), ke = SB2->Operands.end(); + EqualInputs && k != ke; ++k) { + if (OperandSet.count(*k) == 0) + EqualInputs = false; + } + // merge superbale if i covers j + if (EqualInputs) { + MergeCandidate(*i, *j); + } + } + } + // Sort the candidates. + std::sort(SecondRound.begin(), SecondRound.end()); + // Try each candidate. 
+ for (auto i = SecondRound.begin(), e = SecondRound.end(); i != e; ++i) { + if (i->Benefit <= 0 || i->SB == nullptr) + break; + bool status = sink(InsertBefore, i->SB); + assert(status); + (void)status; + } + } +} + +// Merge the Rhs into the Lhs candidate assuming that Rhs input operands +// are covered by the Lhs candidate +void GenXDepressurizer::MergeCandidate(SinkCandidate &Lhs, SinkCandidate &Rhs) { + // update the benefit + Lhs.Benefit += getSuperbaleKillSize(Rhs.SB); + // merge superbale + SmallVector Merge; + auto a = Lhs.SB->Bales.begin(); + auto ae = Lhs.SB->Bales.end(); + auto b = Rhs.SB->Bales.begin(); + auto be = Rhs.SB->Bales.end(); + while (1) { + if (a == ae && b == be) + break; + if (b == be) { + Merge.push_back(*a); + ++a; + } else if (a == ae) { + Merge.push_back(*b); + ++b; + } else if (InstNumbers[*b] > InstNumbers[*a]) { + Merge.push_back(*b); + ++b; + } else { + Merge.push_back(*a); + ++a; + } + } + Lhs.SB->Number = InstNumbers[Merge[0]]; + std::swap(Lhs.SB->Bales, Merge); + Rhs.SB = nullptr; + Rhs.Benefit = (-1); +} + +/*********************************************************************** + * sink : sink the superbale if possible + * + * Enter: InsertBefore = instruction to insert before + * SB = the superbale to sink + * + * Return: whether succeeded + */ +bool GenXDepressurizer::sink(Instruction *InsertBefore, Superbale *SB, + bool AllowClone) { + static unsigned Count = 0; + if (++Count > LimitGenXDepressurizer) + return false; + if (LimitGenXDepressurizer != UINT_MAX) + dbgs() << "genx depressurizer " << Count << '\n'; + unsigned CurNumber = InstNumbers[InsertBefore]; + LLVM_DEBUG(dbgs() << "sink(" << SB->getHead()->getName() << ")\n"); + // Gather the uses that we are going to modify. + SmallVector UsesDominatedByHere; + for (auto ui = SB->getHead()->use_begin(), ue = SB->getHead()->use_end(); + ui != ue; ++ui) { + Use *U = &*ui; + Instruction *user = cast(U->getUser()); + LLVM_DEBUG(dbgs() << " used in [" << InstNumbers[user] << "] " + << user->getName() << '\n'); + unsigned UserNumber = InstNumbers[user]; + if (UserNumber < CurNumber) { + // Skip this user if cloning is allowed. + if (AllowClone) + continue; + LLVM_DEBUG(dbgs() << " rejecting: less than CurNumber " << CurNumber << '\n'); + // This code was originally designed to cope with some uses not being + // dominated by the sink site by cloning the superbale. But this gives an + // assert on frc_iteration6_4x8_ipa. So I am disabling the cloning + // functionality for now by rejecting the whole sink unless all uses are + // dominated by the sink site. This also gives a few minor code size + // improvements in examples too. + return false; + } + UsesDominatedByHere.push_back(U); + } + if (UsesDominatedByHere.empty()) + return false; + // Do the sinking. + BasicBlock *DefBB = sinkOnce(InsertBefore, SB, UsesDominatedByHere); + assert(DefBB == InsertBefore->getParent()); + (void)DefBB; + // We need to modify liveness at the current point. 
+ modifyLiveness(Live, SB); + LLVM_DEBUG(dbgs() << "Successfully sunk "<< SB->getHead()->getName() << '\n'; + Live->print(dbgs()); + dbgs() << '\n'); + return true; +} + +/*********************************************************************** + * sinkOnce : do one sinking of a superbale for a group of uses + * + * Enter: InsertBefore = instruction to insert before + * SB = superbale to sink + * Uses = uses in the group + * + * Return: basic block where sunk superbale was inserted + * + * Currently this only copes with the case that the uses are all dominated + * by InsertBefore, and the moved or cloned def is inserted before InsertBefore + * and the function returns the basic block containing InsertBefore. + * + * However it could be extended to sink for a group of uses that are not + * dominated by InsertBefore but are reachable from it. Then it would insert + * the def at a place that is a common dominator of the uses, and return that + * basic block. + */ +BasicBlock *GenXDepressurizer::sinkOnce(Instruction *InsertBefore, + Superbale *SB, ArrayRef Uses) { + LLVM_DEBUG(dbgs() << "sinkOnce with uses:"; + for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + dbgs() << " [" + << InstNumbers[cast((*i)->getUser())] + << ']' << (*i)->getUser()->getName(); + dbgs() << '\n'); + // Insert after the current instruction. + BasicBlock *InsertBB = InsertBefore->getParent(); + unsigned InsertNum = InstNumbers[InsertBefore]; + assert(InsertNum != 0); + LLVM_DEBUG(dbgs() << "InsertBefore: " << InsertBefore->getName() << '\n'); + // Remove this group of uses from the superbale. + auto Undef = UndefValue::get(SB->getHead()->getType()); + for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + **i = Undef; + Instruction *Changed = nullptr; + if (SB->getHead()->use_empty()) { + // The superbale now has no uses. So we can simply move the instructions. + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + Bale B; + Baling->buildBale(*i, &B); + for (auto j = B.begin(), je = B.end(); j != je; ++j) { + Changed = j->Inst; + Changed->removeFromParent(); + Changed->insertBefore(InsertBefore); + InstNumbers[Changed] = InsertNum - 1; + ++NumSunk; + } + } + } else { + // The superbale still has uses, so we need to clone it. + std::map ClonedInsts; + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + Bale B; + Baling->buildBale(*i, &B); + Instruction *InstToClone = nullptr; + for (auto j = B.begin(), je = B.end(); j != je; ++j) { + InstToClone = j->Inst; + Changed = InstToClone->clone(); + Changed->insertBefore(InsertBefore); + Changed->setName(InstToClone->getName() + ".cloned"); + // Ensure new instruction has the same baling. + Baling->setBaleInfo(Changed, j->Info); + for (unsigned k = 0, ke = Changed->getNumOperands(); k != ke; ++k) { + if (auto O = dyn_cast(Changed->getOperand(k))) { + auto it = ClonedInsts.find(O); + if (it != ClonedInsts.end()) + Changed->setOperand(k, it->second); + } + } + ClonedInsts[InstToClone] = Changed; + InstNumbers[Changed] = InsertNum - 1; + ++NumCloned; + } + } + } + // Change our uses to use the moved/cloned superbale. 
+ for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + **i = Changed; + if (Changed) { + LLVM_DEBUG(dbgs() << "Sunk/cloned superbale head is " << Changed->getName() + << '\n'); + } else { + LLVM_DEBUG(dbgs() << "Warning: Changed is nullptr\n"); + } + return InsertBB; +} + +/*********************************************************************** + * modifyLiveness : modify liveness (at some point) to reflect the sinking + * of the superbale past it + * + * Enter: Live = the liveness to modify + * SB = the superbale + * + * Return: true if the result of the superbale was removed from liveness, + * false if it was not live already + */ +bool GenXDepressurizer::modifyLiveness(Liveness *Live, Superbale *SB) { + // Remove the superbale's result from liveness. + for (auto i = SB->Bales.begin(), e = SB->Bales.end(); i != e; ++i) { + Live->removeValue(*i); + } + for (auto i = SB->Operands.begin(), e = SB->Operands.end(); i != e; ++i) + if (*i) + Live->addValue(*i); + return true; +} + +int GenXDepressurizer::getSuperbaleKillSize(Superbale *SB) { + int sum = 0; + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + if (GenXIntrinsic::isWrRegion(*i)) + sum += Liveness::getValueSize((*i)->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + else + sum += Liveness::getValueSize(*i); + } + return sum; +} + +/*********************************************************************** + * getSinkBenefit : calculate the benefit of sinking this Superbale + * + * Enter: SB = superbale to consider + * Cat = category of value being sunk + * Headroom = if flag sinking, the headroom in normal register + * pressure + * + * For normal (non-flag) sinking, the benefit is the size of the result + * minus the total size of the superbale's operands. + * + * For flag sinking, the benefit is the size of the flag result minus the + * total size of any flag operands to the superbale. + */ +int GenXDepressurizer::getSinkBenefit(Superbale *SB, Liveness::Category Cat, + unsigned Headroom) { + int Benefit = getSuperbaleKillSize(SB); + unsigned FlagOperandSize = 0, AddrOperandSize = 0, OperandSize = 0; + for (auto i = SB->Operands.begin(), e = SB->Operands.end(); i != e; ++i) { + Value *Operand = *i; + if (!Operand || isa(Operand)) + continue; + if (Live->contains(Operand)) + continue; + if (Liveness::isFlag(Operand)) + FlagOperandSize += Liveness::getValueSize(Operand); + else if (Liveness::isAddr(Operand)) + AddrOperandSize += Liveness::getValueSize(Operand); + else + OperandSize += Liveness::getValueSize(Operand); + } + switch (Cat) { + case Liveness::FLAG: + return Benefit - FlagOperandSize; // Flag sinking. + case Liveness::ADDR: + return Benefit - AddrOperandSize; // Addr sinking. + default: + break; + } + return Benefit - OperandSize; +} + +/*********************************************************************** + * fillSuperbale : find a chain of instruction to move + * + * Return: false is the chain has side-effect, cannot be moved. + * + * For a vector-of-i1 and/or/not instruction, the superbale contains the + * tree of boolean and/or/not instructions plus the bales for the cmp + * instructions that created the booleans. + * + * For a wrregion, the superbale contains the bale for each wrregion in + * the chain of wrregion bales with the same inputs. + * + * Otherwise, it contains just the present instruction's bale. 
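+ *
+ * An illustrative example (not from the original comment): legalization of
+ * a 32-wide add of %a and %b into 8-wide pieces can leave a chain like
+ *
+ *   %w0 = wrregion(undef, add(rdr(%a,0),  rdr(%b,0)),  offset 0)
+ *   %w1 = wrregion(%w0,   add(rdr(%a,8),  rdr(%b,8)),  offset 8)
+ *   %w2 = wrregion(%w1,   add(rdr(%a,16), rdr(%b,16)), offset 16)
+ *   %w3 = wrregion(%w2,   add(rdr(%a,24), rdr(%b,24)), offset 24)
+ *
+ * The superbale headed by %w3 then contains all four wrregion bales, with
+ * %a and %b as the shared out-of-bale operands, so the whole chain is moved
+ * or cloned as a single unit.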
+ * + * A bale with an indirect operand also includes the address generating + * instruction(s) in the superbale, so that, where a superbale is cloned, + * we maintain the constraint that an address generating instruction has + * exactly one use between GenXCategory and GenXAddressCommoning. + */ +bool GenXDepressurizer::fillSuperbale(Superbale *SB, Instruction *Inst, + bool IsFlag) { + // This is a new Superbale. Gather the bale(s) that make the superbale, + // and record the operands. First get the out-of-bale operands of the bale + // headed by Inst. We do this in the order such that the "old value of + // vector" operand of any wrregion heading the bale is the first operand + // pushed into SB->Operands. + SB->Number = InstNumbers[Inst]; + SB->Bales.push_back(Inst); + SmallSet OperandSet; + Bale B; + Baling->buildBale(Inst, &B); + bool OnlyRdWrRegion = true; + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + if (BI->Inst->mayHaveSideEffects() || BI->Inst->mayReadOrWriteMemory()) + return false; // not safe to sink + if (OnlyRdWrRegion && // Only chk the following conds if still required. + !GenXIntrinsic::isWrRegion(BI->Inst) && !GenXIntrinsic::isRdRegion(BI->Inst) && + !isa(BI->Inst) && + GenXIntrinsic::getGenXIntrinsicID(BI->Inst) != GenXIntrinsic::genx_add_addr) + OnlyRdWrRegion = false; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (!isa(Opnd) && !isa(Opnd)) + continue; + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + if (OnlyRdWrRegion || SB->Operands.empty()) { + return false; // moving this kind of bale may mess up coalescing + } + if (IsFlag) { + // Boolean operation. For any boolean input, include the instruction that + // generates it in the bale, as long as this is the only use. A superbale + // is then potentially a tree of boolean operations combined by and/or/not, + // and then at each leaf of the tree a cmp or a chain of cmps linked by + // wrpredregion (i.e. multiple cmps writing to different parts of the same + // flag register). + for (unsigned i = 0; i != SB->Operands.size(); ++i) { + Inst = dyn_cast_or_null(SB->Operands[i]); + if (!Inst) + continue; + if (!Liveness::isFlag(Inst)) + continue; + if (!Inst->hasOneUse()) + continue; + if (isa(Inst)) + continue; + Bale B2; + Baling->buildBale(Inst, &B2); + SB->Operands[i] = nullptr; + SB->Bales.push_back(Inst); + for (auto bi = B2.rbegin(), be = B2.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + } + } else if (GenXIntrinsic::isWrRegion(Inst)) { + // Non-boolean operation headed by a wrregion. + Value *Opnd0 = SB->Operands[0]; + for (;;) { + if (!GenXIntrinsic::isWrRegion(Opnd0)) + break; + Inst = cast(Opnd0); + if (!Inst->hasOneUse()) + break; + // The "old value of vector" input is another wrregion. Check that all + // the operands are the same, except the "old value of vector" input + // to that one. 
+ Bale B2; + Baling->buildBale(Inst, &B2); + Opnd0 = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + for (auto bi = B2.rbegin(), be = B2.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (Opnd == Opnd0) + continue; + if (!isa(Opnd) && !isa(Opnd)) + continue; + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + SB->Bales.push_back(Inst); + // Replace the previous "old value of vector" in SB->Operands. + SB->Operands[0] = Opnd0; // Opnd0 could be "undef" + } + } + // Now check whether any operand is an address. If so, include the address + // generating instruction in the superbale, so that, where a superbale is + // cloned, we maintain the constraint that an address generating instruction + // has exactly one use between GenXCategory and GenXAddressCommoning. + for (unsigned oi = 0, oe = SB->Operands.size(); oi != oe; /*EMPTY*/) { + Value *Opnd = SB->Operands[oi]; + switch (GenXIntrinsic::getGenXIntrinsicID(Opnd)) { + case GenXIntrinsic::genx_convert_addr: + case GenXIntrinsic::genx_add_addr: { + auto Addr = cast(Opnd); + SB->Bales.push_back(Addr); + SB->Operands[oi] = Addr->getOperand(0); + continue; + } + default: + break; + } + ++oi; + } + return true; +} + +/*********************************************************************** + * Superbale::print : debug print + */ +void Superbale::print(raw_ostream &OS) { + OS << "Superbale[" << Number << ']'; + for (auto i = Bales.begin(), e = Bales.end(); i != e; ++i) + OS << ' ' << (*i)->getName(); + OS << ", operands:"; + for (auto i = Operands.begin(), e = Operands.end(); i != e; ++i) + if (*i) + OS << ' ' << (*i)->getName(); +} + +/*********************************************************************** + * copyFrom : copy this Liveness from the other one + */ +void Liveness::copyFrom(Liveness *Other) { + for (auto ci = cat_begin(), ce = cat_end(); ci != ce; ++ci) { + Values[ci].clear(); + Pressures[ci] = Other->Pressures[ci]; + } + Pressure = Other->Pressure; + copyValues(Other); +} + +/*********************************************************************** + * getValueSize : get the byte size of a value + * + * We round up to an even number of bytes as that's what we need for counting + * flag pressure, and we may as well do the same for normal pressure. 
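+ *
+ * For example, a single i1 flag (1 bit) counts as 2 bytes, a <16 x i1>
+ * predicate (16 bits) also counts as 2 bytes, and a <8 x i32> value
+ * (256 bits) counts as 32 bytes, while a genx.add.addr result counts as 0.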
+ */ +unsigned Liveness::getValueSize(Value *V) { + if (isAddr(V)) + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_add_addr: + return 0; + default: + break; + } + return (V->getType()->getPrimitiveSizeInBits() + 15) / 8U & -2U; +} + +/*********************************************************************** + * Liveness::addValue : add value to this liveness + */ +void Liveness::addValue(Value *V) { + auto Cat = GENERAL; + if (isFlag(V)) + Cat = FLAG; + else if (isAddr(V)) + Cat = ADDR; + if (Values[Cat].insert(V).second) { + Pressure += getValueSize(V); + Pressures[Cat] += getValueSize(V); + } +} + +/*********************************************************************** + * Liveness::removeValue : remove value from this liveness + * + * Return: true if the value was removed, false if it was not live anyway + */ +bool Liveness::removeValue(Value *V) { + auto Cat = GENERAL; + if (isFlag(V)) + Cat = FLAG; + else if (isAddr(V)) + Cat = ADDR; + if (!Values[Cat].erase(V)) + return false; + Pressure -= getValueSize(V); + Pressures[Cat] -= getValueSize(V); + return true; +} + +/*********************************************************************** + * Liveness::copyValues : copy values from Other into this liveness + */ +void Liveness::copyValues(Liveness *Other) { + for (auto ci = cat_begin(), ce = cat_end(); ci != ce; ++ci) { + for (auto i = Other->Values[ci].begin(), e = Other->Values[ci].end(); + i != e; ++i) + addValue(*i); + } +} + +/*********************************************************************** + * Liveness::print : debug print + */ +void Liveness::print(raw_ostream &OS) { + OS << "[addrpressure=" << Pressures[ADDR] + << ",flagpressure=" << Pressures[FLAG] << ",pressure=" << Pressure << ']'; + for (unsigned Cat = NUMCATS; Cat--; /*EMPTY*/) { + if (!Values[Cat].empty()) { + const char *CatName = (Cat == FLAG ? "flag." : + Cat == ADDR ? "addr." : ""); + OS << ' ' << CatName << "live:"; + for (auto i = begin(Cat), e = end(Cat); i != e; ++i) + OS << ' ' << (*i)->getName(); + } + } +} + +/*********************************************************************** + * PseudoCFG::Node::removeSucc : remove block from the node's successor + * list (if it is in the list at all) + * + * This is only used when removing edges to unstick ourselves when there is + * irreducible flow. + */ +void PseudoCFG::Node::removeSucc(BasicBlock *Succ) { + for (unsigned i = 0, e = Succs.size(); i != e; ++i) { + if (Succ == Succs[i]) { + Succs[i] = Succs[Succs.size() - 1]; + Succs.pop_back(); + break; + } + } +} + +/*********************************************************************** + * PseudoCFG::Node::removePred : remove block from the node's predecessor + * list (if it is in the list at all) + * + * The only case when this is possibly called with a Pred that is not on the + * list is when attempting to remove loop backedges but flow is irreducible. + * This happens in compute_first_def_bug_5. + */ +void PseudoCFG::Node::removePred(BasicBlock *Pred) { + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + if (Pred == Preds[i]) { + Preds[i] = Preds[Preds.size() - 1]; + Preds.pop_back(); + break; + } + } +} + +/*********************************************************************** + * PseudoCFG::compute : compute the pseudo CFG for the function + */ +void PseudoCFG::compute(Function *F, DominatorTree *DT, + LoopInfoBase *LI) { + clear(); + // Initialize the graph to the same as the CFG. While we're scanning the + // CFG, remember the natural loop backedges. 
+ SmallVector Backedges; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + auto TI = BB->getTerminator(); + // Remember BB if it is a backedge. + if (TI->getNumSuccessors() == 1 && DT->dominates(TI->getSuccessor(0), BB)) + Backedges.push_back(BB); + // Add the edges out of BB. + auto BBNode = getNode(BB); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + BBNode->Succs.push_back(Succ); + getNode(Succ)->Preds.push_back(BB); + } + } + // For each natural loop backedge, remove the backedge and add the loop + // exit edges. This is all the changes we need to make if the CFG is + // reducible. If it is irreducible, we will need to remove more edges + // when we derive the ordering. + for (unsigned i = 0, e = Backedges.size(); i != e; ++i) { + BasicBlock *BB = Backedges[i]; + auto BBNode = getNode(BB); + assert(BBNode->Succs.size() == 1 && + "expecting backedge to have one successor " + "as we have split critical edges"); + BasicBlock *Header = BBNode->Succs[0]; + BBNode->LoopHeader = Header; + BBNode->Succs.clear(); // This removes Header as BB's only successor. + getNode(Header)->removePred(BB); + Loop *L = LI->getLoopFor(Header); + SmallVector ExitBlocks; + assert(L); + L->getExitBlocks(ExitBlocks); + for (unsigned j = 0, je = ExitBlocks.size(); j != je; ++j) { + BasicBlock *Exit = ExitBlocks[j]; + BBNode->Succs.push_back(Exit); + getNode(Exit)->Preds.push_back(BB); + } + } + // Derive the ordering. + std::map Pending; + SmallVector Ready; + std::set Done; + Ready.push_back(&F->front()); + for (;;) { + if (Ready.empty()) { + if (Pending.empty()) + break; // finished + // We have got stuck. The CFG must be irreducible. Unstick ourselves + // by choosing the pending block that is earliest in the function, + // removing any pending edges from it, and making it ready. + BasicBlock *BB = nullptr; + for (auto fi = F->begin();; ++fi) { + BB = &*fi; + if (Pending.find(BB) != Pending.end()) + break; + } + std::set UnseenPreds; + for (auto ui = BB->use_begin(), ue = BB->use_end(); ui != ue; ++ui) { + auto Pred = cast(ui->getUser())->getParent(); + if (Done.find(Pred) != Done.end()) + continue; + UnseenPreds.insert(Pred); + } + for (auto i = UnseenPreds.begin(), e = UnseenPreds.end(); i != e; ++i) { + getNode(BB)->removePred(*i); + getNode(*i)->removeSucc(BB); + } + Pending.erase(BB); + Ready.push_back(BB); + continue; + } + // Pop a ready block off the stack. + auto BB = Ready.back(); + Ready.pop_back(); + Ordering.push_back(BB); + // For each successor, decrement the pending count. If it becomes 0, the + // successor becomes ready. + auto BBNode = getNode(BB); + for (auto si = BBNode->succ_begin(), se = BBNode->succ_end(); + si != se; ++si) { + BasicBlock *Succ = *si; + auto PendingEntry = &Pending[Succ]; + if (!*PendingEntry) { + // New entry in the pending map. Count the predecessors. + for (auto pi = getNode(Succ)->pred_begin(), + pe = getNode(Succ)->pred_end(); pi != pe; ++pi) + ++*PendingEntry; + } + if (--*PendingEntry) + continue; + // Successor needs to become ready. 
+ Pending.erase(Succ); + Ready.push_back(Succ); + } + } +} + +/*********************************************************************** + * PseudoCFG::print : print the pseudo-CFG + */ +void PseudoCFG::print(raw_ostream &OS) { + OS << "PseudoCFG:\n"; + for (auto i = Ordering.begin(), e = Ordering.end(); i != e; ++i) { + auto BB = *i; + auto Node = getNode(BB); + OS << BB->getName(); + if (Node->LoopHeader) + OS << " loop header " << Node->LoopHeader->getName(); + OS << "\n preds:"; + for (auto pi = Node->pred_begin(), pe = Node->pred_end(); pi != pe; ++pi) + OS << ' ' << (*pi)->getName(); + OS << "\n succs:"; + for (auto si = Node->succ_begin(), se = Node->succ_end(); si != se; ++si) + OS << ' ' << (*si)->getName(); + OS << '\n'; + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp new file mode 100644 index 000000000000..3c6102f47e53 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp @@ -0,0 +1,174 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXEmulate +/// ----------- +/// +/// GenXEmulate is a mudule pass that emulates certain LLVM IR instructions. +/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXSubtarget.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXEmulate : public ModulePass { + // Maps to its corresponding emulation function. + using OpType = std::pair; + std::map EmulationFuns; + const GenXSubtarget * ST = nullptr; + +public: + static char ID; + explicit GenXEmulate() : ModulePass(ID) {} + virtual StringRef getPassName() const { return "GenX emulation"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnModule(Module &M); + bool runOnFunction(Function &F); +private: + bool emulateInst(Instruction *Inst); + Function *getEmulationFunction(Instruction *Inst); + // Check if a function is to emulate instructions. + static bool isEmulationFunction(const Function* F) { + if (F->empty()) + return false; + if (F->hasFnAttribute("CMBuiltin")) + return true; + // FIXME: The above attribute is lost during SPIR-V translation. 
+ if (F->getName().contains("__cm_intrinsic_impl_")) + return true; + return false; + } +}; + +} // end namespace + +char GenXEmulate::ID = 0; +namespace llvm { +void initializeGenXEmulatePass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false) +INITIALIZE_PASS_END(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false) + +ModulePass *llvm::createGenXEmulatePass() { + initializeGenXEmulatePass(*PassRegistry::getPassRegistry()); + return new GenXEmulate; +} + +void GenXEmulate::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool GenXEmulate ::runOnModule(Module &M) { + bool Changed = false; + EmulationFuns.clear(); + if (auto P = getAnalysisIfAvailable()) + ST = P->getSubtarget(); + + // Process non-builtin functions. + for (auto &F : M.getFunctionList()) { + if (!isEmulationFunction(&F)) + Changed |= runOnFunction(F); + } + + // Delete unuse builtins or make used builtins internal. + for (auto I = M.begin(); I != M.end();) { + Function &F = *I++; + if (isEmulationFunction(&F)) { + Changed = true; + if (F.use_empty()) + F.eraseFromParent(); + else + F.setLinkage(GlobalValue::InternalLinkage); + } + } + + return Changed; +} + +bool GenXEmulate::runOnFunction(Function &F) { + bool Changed = false; + for (auto &BB : F.getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + Changed |= emulateInst(Inst); + } + } + return Changed; +} + +Function *GenXEmulate::getEmulationFunction(Instruction *Inst) { + unsigned Opcode = Inst->getOpcode(); + Type *Ty = Inst->getType(); + OpType OpAndType = std::make_pair(Opcode, Ty); + + // Check if this emulation function has been cached. + auto Iter = EmulationFuns.find(OpAndType); + if (Iter != EmulationFuns.end()) + return Iter->second; + + assert(ST && "subtarget expected"); + StringRef EmuFnName = ST->getEmulateFunction(Inst); + if (EmuFnName.empty()) + return nullptr; + + Module *M = Inst->getParent()->getParent()->getParent(); + for (auto &F : M->getFunctionList()) { + if (!isEmulationFunction(&F)) + continue; + if (F.getReturnType() != Inst->getType()) + continue; + StringRef FnName = F.getName(); + if (FnName.contains(EmuFnName)) { + EmulationFuns[OpAndType] = &F; + return &F; + } + } + + return nullptr; +} + +bool GenXEmulate::emulateInst(Instruction *Inst) { + Function *EmuFn = getEmulationFunction(Inst); + if (!EmuFn) + return false; + + assert(!isa(Inst) && "call emulation not supported yet"); + IRBuilder<> Builder(Inst); + SmallVector Args(Inst->operands()); + Value *EmuInst = Builder.CreateCall(EmuFn, Args); + Inst->replaceAllUsesWith(EmuInst); + Inst->eraseFromParent(); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp new file mode 100644 index 000000000000..32f2cf3bfa02 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp @@ -0,0 +1,295 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + 
+The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXExtractVectorizer +/// --------------------- +/// +/// GenX extract vectorizer pass is stage 1 of the histogram optimization: if +/// there are multiple scalar rdregions from the same vector, all subject +/// to the same binary operator with constant rhs or the same trunc/zext/sext, +/// then they are combined into a vector version of the binary operator or +/// trunc/zext/sext, with scalar rdregions from the result of that. This is +/// designed to handle any trunc/zext/sext then scale of the index in the +/// histogram optimization, although it does also apply in a few other cases. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ExtractVectorizer" + +#include "GenX.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenX extract vectorizer pass +class GenXExtractVectorizer : public FunctionPass { + bool Modified = false; + DominatorTree *DT = nullptr; + SmallVector Extracted; + std::set ExtractedSet; + struct Extract { + Instruction *Inst; // the binary operator applied to the extracted element + int Offset; // constant offset from the rdregion + Extract(Instruction *Inst, int Offset) : Inst(Inst), Offset(Offset) {} + // Sort in offset order + bool operator<(const Extract &Other) const { return Offset < Other.Offset; } + }; + struct BucketIndex { + unsigned Opcode; + Type *CastTo; + Value *Indirect; + Type *ConvTy; + BucketIndex(unsigned Opcode, Type *CastTo, Value *Indirect, Type *ConvTy) + : Opcode(Opcode), CastTo(CastTo), Indirect(Indirect), ConvTy(ConvTy) {} + bool operator<(const BucketIndex &Other) const { + if (Opcode != Other.Opcode) + return Opcode < Other.Opcode; + if (CastTo != Other.CastTo) + return CastTo < Other.CastTo; + return Indirect < Other.Indirect; + } + }; +public: + static char ID; + explicit GenXExtractVectorizer() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX Extract Vectorizer"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + } + bool runOnFunction(Function &F); +private: + void processExtracted(Value *V); + void processBucket(const BucketIndex *BIdx, SmallVectorImpl *B); +}; + +}// end namespace llvm + + +char GenXExtractVectorizer::ID = 0; +namespace llvm { void initializeGenXExtractVectorizerPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXExtractVectorizer, "GenXExtractVectorizer", + "GenXExtractVectorizer", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXExtractVectorizer, "GenXExtractVectorizer", + "GenXExtractVectorizer", false, false) + +// Publicly exposed interface to pass... +FunctionPass *llvm::createGenXExtractVectorizerPass() +{ + initializeGenXExtractVectorizerPass(*PassRegistry::getPassRegistry()); + return new GenXExtractVectorizer(); +} + +/*********************************************************************** + * runOnFunction : run the extract vectorizer for this Function + */ +bool GenXExtractVectorizer::runOnFunction(Function &F) +{ + DT = &getAnalysis().getDomTree(); + // Scan the code looking for vector values that have an extract (a rdregion + // of one element) applied. + for (auto fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!GenXIntrinsic::isRdRegion(Inst)) + continue; + if (isa(Inst->getType())) + continue; + Value *V = Inst->getOperand(0); + if (isa(V)) + continue; + if (ExtractedSet.insert(V).second) + Extracted.push_back(V); + } + } + ExtractedSet.clear(); + // Process each such vector. Processing a vector might result in another + // new vector being pushed onto Extracted, so that in turn will be processed. + while (!Extracted.empty()) { + Value *V = Extracted.back(); + Extracted.pop_back(); + processExtracted(V); + } + return Modified; +} + +/*********************************************************************** + * GenXExtractVectorizer::processExtracted : process an instruction or arg that + * has at least one scalar extracted from it (using rdregion), in the hope that + * we can vectorize it as the first stage of the histogram optimization + */ +void GenXExtractVectorizer::processExtracted(Value *V) +{ + // Gather the scalar extracting rdregion uses of V into buckets, one for + // each binaryoperator with constant rhs that the extracted value is used in. + std::map> Buckets; + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (!GenXIntrinsic::isRdRegion(user)) + continue; // not rdregion + if (isa(user->getType())) + continue; // not rdregion with scalar result + if (!user->hasOneUse()) + continue; // rdregion not single use + auto Use2 = &*user->use_begin(); + auto User2 = cast(Use2->getUser()); + // We want User2 to be either a binary operator with constant rhs, + // or a trunc/zext/sext. + Type *CastTo = nullptr; + if (isa(User2)) { + if (!isa(User2->getOperand(1))) + continue; // binary operator has non-constant rhs + } else { + if (!isa(User2) || isa(User2)) + continue; // not trunc/zext/sext + CastTo = User2->getType(); + } + // Get the index, possibly as index+offset if the index is a balable add + // instruction. + Region R = Region::getWithOffset(user); + // Add to the bucket. The bucket is indexed by: + // - the opcode of the binaryoperator or trunc/zext/sext using the + // extracted value + // - the type being trunc/zext/sext to + // - any variable part of the rdregion index + // The Extract pushed into the bucket contains: + // - the binaryoperator itself (from which we can find the rdregion) + // - the constant offset part of the rdregion index. + Buckets[BucketIndex(User2->getOpcode(), CastTo, R.Indirect, User2->getType())] + .push_back(Extract(User2, R.Offset)); + } + // Now look at each bucket. Only bother with a bucket that has at least four + // scalar extracts in it. 
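+  // Illustrative sketch only (values, offsets and the abbreviated rdregion
+  // form are invented for the example): a bucket of four such extracts from
+  // <16 x i32> %v, each feeding a multiply by a constant, e.g.
+  //   %e0 = rdregion(%v, byte offset 0)  ; %m0 = mul i32 %e0, 4
+  //   %e1 = rdregion(%v, byte offset 4)  ; %m1 = mul i32 %e1, 4
+  //   %e2 = rdregion(%v, byte offset 8)  ; %m2 = mul i32 %e2, 4
+  //   %e3 = rdregion(%v, byte offset 12) ; %m3 = mul i32 %e3, 4
+  // is turned by processBucket into one 4-wide rdregion of %v, a single
+  // "mul <4 x i32> ..., <i32 4, i32 4, i32 4, i32 4>", and four scalar
+  // rdregions of that result which replace %m0..%m3.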
+ for (auto i = Buckets.begin(), e = Buckets.end(); i != e; ++i) { + auto Bucket = &i->second; + if (Bucket->size() < 4) + continue; + processBucket(&i->first, Bucket); + } +} + +/*********************************************************************** + * GenXExtractVectorizer::processBucket : process one bucket of extracts from + * the same vector + * + * The bucket contains at least 4 instances of a binary operator whose rhs + * is constant and whose lhs is an extract (a scalar rdregion) from the same + * vector. Either each index is constant, or each index is an add with constant + * rhs and with the same lhs. + */ +void GenXExtractVectorizer::processBucket(const BucketIndex *BIdx, + SmallVectorImpl *B) +{ + // Sort the extracts into offset order. + std::sort(B->begin(), B->end()); + // See if we have a sequence of offsets such that we can construct a + // 1D region. + int Diff = (*B)[1].Offset - (*B)[0].Offset; + for (unsigned j = 1, je = B->size() - 1; j != je; ++j) + if ((*B)[j + 1].Offset - (*B)[j].Offset != Diff) + return; + // Find the latest point that we can insert the vectorized instruction. + SmallVector Insts; + for (auto j = B->begin(), je = B->end(); j != je; ++j) + Insts.push_back(j->Inst); + auto InsertBefore = findClosestCommonDominator(DT, Insts); + // Create the new rdregion. + Extract *Extract0 = &(*B)[0]; + Region R(Extract0->Inst->getOperand(0)); + R.NumElements = R.Width = B->size(); + R.Stride = Diff / R.ElementBytes; + R.Indirect = BIdx->Indirect; + R.Offset = Extract0->Offset; + Value *OrigVector = cast(Extract0->Inst->getOperand(0)) + ->getOperand(0); + Value *NewRdRegion = OrigVector; + // Need to splat if Diff is 0, otherwise elements extracted are wrong. + if (Diff == 0 || R.Indirect || R.Offset || + R.NumElements != + cast(OrigVector->getType())->getNumElements()) { + // Not identity region. + NewRdRegion = R.createRdRegion(OrigVector, + Extract0->Inst->getName() + ".histogrammed", InsertBefore, + Extract0->Inst->getDebugLoc(), /*AllowScalar=*/false); + } + // Create the vectorized binary operator or trunc/zext/sext. + Instruction *NewInst = nullptr; + if (isa(Extract0->Inst)) { + // Create a vector of the constants used in the right side of the binary + // operators. + SmallVector RhsConsts; + for (auto j = B->begin(), je = B->end(); j != je; ++j) + RhsConsts.push_back(cast(j->Inst->getOperand(1))); + auto CV = ConstantVector::get(RhsConsts); + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Extract0->Inst->getOpcode(), NewRdRegion, CV, + Extract0->Inst->getName() + ".histogrammed", InsertBefore); + } else { + // Create the vectorized trunc/zext/sext. + auto VT = VectorType::get(Extract0->Inst->getType(), B->size()); + NewInst = CastInst::Create((Instruction::CastOps)Extract0->Inst->getOpcode(), + NewRdRegion, VT, + Extract0->Inst->getName() + ".histogrammed", InsertBefore); + } + NewInst->setDebugLoc(Extract0->Inst->getDebugLoc()); + // For each original scalar binary operator or cast, create a rdregion to + // extract the equivalent scalar from the result of the vectorized binary + // operator, and use it to replace uses of the original binary operator. 
+ for (auto j = B->begin(), je = B->end(); j != je; ++j) { + Region R2(NewInst); + R2.NumElements = R2.Width = 1; + R2.Offset = (j - B->begin()) * R2.ElementBytes; + auto NewRdRegion2 = R2.createRdRegion(NewInst, "", + InsertBefore, j->Inst->getDebugLoc(), /*AllowScalar=*/true); + NewRdRegion2->takeName(j->Inst); + j->Inst->replaceAllUsesWith(NewRdRegion2); + } + for (auto j = B->begin(), je = B->end(); j != je; ++j) { + auto OldRdRegion = cast(j->Inst->getOperand(0)); + j->Inst->eraseFromParent(); + OldRdRegion->eraseFromParent(); + } + // Add the new vectorized binary operator or cast back into + // ExtractVectorizer so the extracts we added could in turn be vectorized. + Extracted.push_back(NewInst); + Modified = true; +} + + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp new file mode 100644 index 000000000000..9558b6fddb37 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp @@ -0,0 +1,364 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// This pass lowers all function pointers related instructions +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// Function pointers lowering consists of two stages: +// 1. Collect all instruction that use function pointers and their users that +// have to be modified +// 2. Actually modify the instructions collected: +// - reconstruct existing wrr/rdr instrinsics (remove internal casts, use i64 +// types) +// - create new wrr/rdrs where necessary, e.g. 
as a select args for further +// baling to succeed +// - reconstruct all funcptrs-related phis +// - update all users of the instruction modified (may insert additional +// casts where necessary, +// e.g. ptrtoint for wrr to +// indirect call) + +class GenXFunctionPointersLowering : public ModulePass { + SetVector InstToProcess; + std::map PhisIndex; + + const DataLayout *DL; + LLVMContext *Ctx; + + bool IsFuncPointerVec(Value *V, SetVector *Funcs = nullptr); + + void collectFuncUsers(User *U); + void collectFuncUsers(IGCLLVM::CallInst *CI); + void collectFuncUsers(PHINode *Phi); + void collectFuncUsers(CastInst *Phi); + void collectFuncUsers(SelectInst *SI); + + void reconstructCall(CallInst *CI); + void reconstructPhi(PHINode *Phi); + void reconstructSelect(SelectInst *SI); + + void replaceAllUsersCommon(Instruction *Old, Instruction *New); + + Value *transformFuncPtrVec(Value *V); + +public: + static char ID; + explicit GenXFunctionPointersLowering(); + StringRef getPassName() const override { + return "GenX function pointers lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; +}; + +} // namespace + +char GenXFunctionPointersLowering::ID = 0; +namespace llvm { +void initializeGenXFunctionPointersLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXFunctionPointersLowering, + "GenXFunctionPointersLowering", + "GenXFunctionPointersLowering", false, false) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(GenXModule) +INITIALIZE_PASS_END(GenXFunctionPointersLowering, + "GenXFunctionPointersLowering", + "GenXFunctionPointersLowering", false, false) + +GenXFunctionPointersLowering::GenXFunctionPointersLowering() : ModulePass(ID) { + initializeGenXFunctionPointersLoweringPass(*PassRegistry::getPassRegistry()); +} + +ModulePass *llvm::createGenXFunctionPointersLoweringPass() { + return new GenXFunctionPointersLowering(); +} + +void GenXFunctionPointersLowering::getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); +} + +bool GenXFunctionPointersLowering::runOnModule(Module &M) { + bool Modified = false; + + for (auto &F : M) + if (F.hasAddressTaken()) { + F.addFnAttr(genx::FunctionMD::CMStackCall); + F.addFnAttr(genx::FunctionMD::ReferencedIndirectly); + } + + for (auto &F : M) + if (F.hasFnAttribute("referenced-indirectly")) + for (auto *U : F.users()) + collectFuncUsers(U); + + Ctx = &M.getContext(); + DL = &M.getDataLayout(); + for (auto *TI : InstToProcess) { + if (auto *Phi = dyn_cast(TI)) + reconstructPhi(Phi); + else if (auto *CI = dyn_cast(TI)) + reconstructCall(CI); + else if (auto *SI = dyn_cast(TI)) + reconstructSelect(SI); + else + assert(0 && "Unsupported instruction to process"); + } + return Modified; +} + +void GenXFunctionPointersLowering::collectFuncUsers(User *U) { + if (auto *CI = dyn_cast(U)) + collectFuncUsers(CI); + else if (auto *C = dyn_cast(U)) + collectFuncUsers(C); + else if (auto *Phi = dyn_cast(U)) + collectFuncUsers(Phi); + else if (auto *SI = dyn_cast(U)) + collectFuncUsers(SI); + else if (auto *ICmp = dyn_cast(U)) { + // skip + } else if (auto *EE = dyn_cast(U)) { + collectFuncUsers(EE); + } else if (isa(U)) + for (auto *UU : U->users()) + collectFuncUsers(UU); + else { + assert(0 && "unsupported funcptr user"); + } +} + +void GenXFunctionPointersLowering::collectFuncUsers(IGCLLVM::CallInst *CI) { + if (!CI->isIndirectCall() && + 
(GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni)) { + InstToProcess.insert(CI); + + for (auto *U : CI->users()) + collectFuncUsers(U); + } +} + +// do not process bitcast itself, after our transformations +// it should become dead and will be swept +void GenXFunctionPointersLowering::collectFuncUsers(CastInst *BC) { + for (auto *U : BC->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::collectFuncUsers(PHINode *Phi) { + InstToProcess.insert(Phi); + + for (auto *U : Phi->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::collectFuncUsers(SelectInst *SI) { + InstToProcess.insert(SI); + + if (!SI->getType()->getScalarType()->isIntegerTy(64)) + for (auto *U : SI->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::replaceAllUsersCommon(Instruction *Old, + Instruction *New) { + while (!Old->use_empty()) { + auto *U = Old->user_back(); + if (auto *CIU = dyn_cast(U)) { + if (CIU->getCalledOperand() == Old) { + auto *IntToPtr = CastInst::CreateBitOrPointerCast( + New, CIU->getCalledOperand()->getType(), "", CIU); + CIU->replaceUsesOfWith(Old, IntToPtr); + } else if (GenXIntrinsic::getAnyIntrinsicID(CIU->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CIU->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni || + CIU->getCalledOperand() != Old) { + CIU->replaceUsesOfWith(Old, New); + } else + assert(0 && "unsupported call of a function pointer"); + } else if (isa(U) || isa(U)) { + U->replaceUsesOfWith(Old, New); + } else if (auto *Phi = dyn_cast(U)) { + Phi->replaceUsesOfWith(Old, New); + PhisIndex[Phi]++; + } else { + assert(0 && "Unsupported function pointer user\n"); + } + } + Old->eraseFromParent(); +} + +void GenXFunctionPointersLowering::reconstructCall(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni); + Region R(Type::getInt64Ty(*Ctx)); + unsigned OffIdx = GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni + ? 
4 + : 5; + if (!isa(CI->getOperand(OffIdx))) + R.Indirect = CI->getOperand(OffIdx); + else + R.Offset = cast(CI->getOperand(OffIdx))->getZExtValue(); + Instruction *Result = nullptr; + if (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni) { + Result = cast( + R.createRdRegion(transformFuncPtrVec(CI->getOperand(0)), CI->getName(), + CI, CI->getDebugLoc(), true)); + } else if (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni) + Result = cast( + R.createWrRegion(transformFuncPtrVec(CI->getOperand(0)), + transformFuncPtrVec(CI->getOperand(1)), CI->getName(), + CI, CI->getDebugLoc())); + if (Result->getType() == CI->getType()) + return; + replaceAllUsersCommon(CI, Result); +} + +void GenXFunctionPointersLowering::reconstructPhi(PHINode *Phi) { + for (auto *Op : Phi->operand_values()) { + auto *OpTr = transformFuncPtrVec(Op); + Phi->replaceUsesOfWith(Op, OpTr); + if (OpTr != Op) + PhisIndex[Phi]++; + } + assert(Phi->getNumOperands() > 0 && Phi->getNumOperands() == PhisIndex[Phi]); + Type *NewTy = Phi->value_op_begin()->getType(); + assert(std::all_of(Phi->value_op_begin(), Phi->value_op_end(), + [&NewTy](Value *V) { return V->getType() == NewTy; })); + auto *NewPhi = PHINode::Create(NewTy, 0, Phi->getName(), Phi); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) + NewPhi->addIncoming(Phi->getIncomingValue(i), Phi->getIncomingBlock(i)); + while (!Phi->user_empty()) { + // already checked that this is only wrr/rdr + auto *U = Phi->user_back(); + U->replaceUsesOfWith(Phi, NewPhi); + } + Phi->eraseFromParent(); +} + +void GenXFunctionPointersLowering::reconstructSelect(SelectInst *SI) { + Value *TVal = nullptr, *FVal = nullptr; + Region R1(SI->getTrueValue()->getType(), DL), + R2(SI->getFalseValue()->getType(), DL); + auto *BCT = BitCastInst::CreateBitOrPointerCast( + transformFuncPtrVec(SI->getTrueValue()), Type::getInt64Ty(*Ctx), "", SI); + BCT = BitCastInst::CreateBitOrPointerCast( + BCT, VectorType::get(Type::getInt64Ty(*Ctx), 1), "", SI); + auto *BCF = BitCastInst::CreateBitOrPointerCast( + transformFuncPtrVec(SI->getFalseValue()), Type::getInt64Ty(*Ctx), "", SI); + BCF = BitCastInst::CreateBitOrPointerCast( + BCF, VectorType::get(Type::getInt64Ty(*Ctx), 1), "", SI); + R1.Width = (SI->getTrueValue()->getType()->isVectorTy()) + ? SI->getTrueValue()->getType()->getVectorNumElements() + : 1; + R1.Width = (SI->getFalseValue()->getType()->isVectorTy()) + ? 
SI->getFalseValue()->getType()->getVectorNumElements() + : 1; + R1.Stride = 0, R1.VStride = 0; + R2.Stride = 0, R2.VStride = 0; + TVal = R1.createRdRegion(BCT, SI->getName(), SI, SI->getDebugLoc(), true); + FVal = R2.createRdRegion(BCF, SI->getName(), SI, SI->getDebugLoc(), true); + auto *NewSI = SelectInst::Create(SI->getCondition(), TVal, FVal, "", SI); + if (SI->getType() == NewSI->getType()) + SI->replaceAllUsesWith(NewSI); + else + replaceAllUsersCommon(SI, NewSI); +} + +Value *GenXFunctionPointersLowering::transformFuncPtrVec(Value *V) { + // quite often wrr/rdr get bitcast of funcptrs to as input, + // here we simply don't need them and DCE will sweep them later + auto Int64Ty = Type::getInt64Ty(*Ctx); + if (isa(V)) { + assert(V->getType()->isSingleValueType()); + if (V->getType()->getScalarType()->isIntegerTy(64)) + return V; + else if (V->getType()->isVectorTy()) + return UndefValue::get( + VectorType::get(Int64Ty, V->getType()->getVectorNumElements())); + else + return UndefValue::get(Int64Ty); + } + if (isa(V) && + cast(V)->getOpcode() == Instruction::BitCast) + V = cast(V)->getOperand(0); + else if (auto *BC = dyn_cast(V)) + if (!(BC->getType()->isVectorTy() && + BC->getType()->getScalarType() == BC->getOperand(0)->getType())) + V = BC->getOperand(0); + SetVector Funcs; + if (!isFuncPointerVec(V, &Funcs)) + return V; + assert(Funcs.size() > 0); + + assert(V->getType()->isVectorTy()); + std::vector CF; + for (auto &Val : Funcs) + CF.push_back(ConstantExpr::getPtrToInt(cast(Val), Int64Ty)); + Value *NewVal = nullptr; + // generate i64 instead of <1 x i64> + if (CF.size() > 1) + NewVal = ConstantVector::get(CF); + else if (CF.size() == 1) + NewVal = CF.front(); + return NewVal; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp new file mode 100644 index 000000000000..36a2a2b8475b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp @@ -0,0 +1,324 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +/// GenXGEPLowering +/// --------------- +/// +/// GenXGEPLowering is a function pass that lowers GEP instructions into +/// primitive ones that the rest of the GenX backend can deal with. 
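+///
+/// As an illustrative sketch (not output copied from the pass), a GEP such as
+///
+///   %p = getelementptr i32, i32* %base, i64 %idx
+///
+/// is lowered to plain integer arithmetic on the pointer value:
+///
+///   %b = ptrtoint i32* %base to i64
+///   %o = shl i64 %idx, 2          ; element size 4 is a power of two
+///   %a = add i64 %b, %o
+///   %p = inttoptr i64 %a to i32*
+///
+/// Struct field indices become constant byte offsets, and an index of the
+/// form "a + b" where the base and "a" are loop-invariant may be
+/// reassociated into (base + (a << shift)) + (b << shift) so the invariant
+/// half can be hoisted later.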
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { +class GenXGEPLowering : public FunctionPass { + const DataLayout *DL = nullptr; + LoopInfo *LI = nullptr; + IRBuilder<> *Builder = nullptr; + +public: + static char ID; + + GenXGEPLowering() : FunctionPass(ID) {} + + virtual StringRef getPassName() const override { return "GenX GEP Lowering"; } + + virtual bool runOnFunction(Function &F) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + } + +private: + bool lowerGetElementPtrInst(GetElementPtrInst *GEP, + BasicBlock::iterator &BBI) const; + Value *truncExpr(Value *Val, Type *NewTy) const; + Value *getSExtOrTrunc(Value *, Type *) const; +}; + +} // namespace + +char GenXGEPLowering::ID = 0; +namespace llvm { void initializeGenXGEPLoweringPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXGEPLowering, "GenXGEPLowering", "GenXGEPLowering", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GenXGEPLowering, "GenXGEPLowering", "GenXGEPLowering", false, false) + +FunctionPass *llvm::createGenXGEPLoweringPass() { + initializeGenXGEPLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXGEPLowering; +} + +bool GenXGEPLowering::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + DL = &F.getParent()->getDataLayout(); + + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + auto FlatAddrSpace = TTI.getFlatAddressSpace(); + + assert(DL && "null datalayout"); +#if 0 + // a good place to fix block layout + if (LI->empty()) + LayoutBlocks(F); + else + LayoutBlocks(F, *LI); +#endif + IRBuilder<> TheBuilder(F.getContext()); + Builder = &TheBuilder; + + bool Changed = false; + for (auto &BB : F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) { + Instruction *Inst = &(*BI++); + Builder->SetInsertPoint(Inst); + + switch (Inst->getOpcode()) { + default: // By default, DO NOTHING + break; + case Instruction::GetElementPtr: + Changed |= lowerGetElementPtrInst(cast(Inst), BI); + break; + case Instruction::PtrToInt: + auto PtrV = cast(Inst)->getPointerOperand(); + auto AddrSpace = cast(Inst)->getPointerAddressSpace(); + if (AddrSpace == FlatAddrSpace) { + if (auto PtrCast = dyn_cast(PtrV)) { + // this is no-op AddrSpaceCast, should be removed + // create a new PtrToInt from the original pointer + // bypass the AddrSpaceCast and PtrToInt + auto P2I = Builder->CreatePtrToInt(PtrCast->getOperand(0), Inst->getType()); + Inst->replaceAllUsesWith(P2I); + Inst->eraseFromParent(); + if (PtrCast->use_empty()) { + PtrCast->eraseFromParent(); + } + } + } + break; + } + } + } + Builder = nullptr; + + return Changed; +} + +bool GenXGEPLowering::lowerGetElementPtrInst(GetElementPtrInst *GEP, + BasicBlock::iterator &BBI) const { + assert(Builder); + Value *PtrOp = GEP->getPointerOperand(); + PointerType *PtrTy = dyn_cast(PtrOp->getType()); + assert(PtrTy && "Only accept scalar pointer!"); + + unsigned PtrSizeInBits = 
DL->getPointerSizeInBits(PtrTy->getAddressSpace()); + unsigned PtrMathSizeInBits = PtrSizeInBits; + auto IntPtrTy = IntegerType::get(Builder->getContext(), PtrSizeInBits); + auto PtrMathTy = IntegerType::get(Builder->getContext(), PtrMathSizeInBits); + + // Check if the pointer itself is created from IntToPtr. If it is, and if + // the int is the same size, we can use the int directly. Otherwise, we + // need to add PtrToInt. + Value *BasePointer = nullptr; + if (IntToPtrInst *I2PI = dyn_cast(PtrOp)) { + Value *IntOp = I2PI->getOperand(0); + if (IntOp->getType() == IntPtrTy) + BasePointer = IntOp; + } + if (!BasePointer) + BasePointer = Builder->CreatePtrToInt(PtrOp, IntPtrTy); + + // This is the value of the pointer, which will ultimately replace gep. + Value *PointerValue = BasePointer; + + Type *Ty = PtrTy; + gep_type_iterator GTI = gep_type_begin(GEP); + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI, ++GTI) { + Value *Idx = *OI; + if (StructType *StTy = GTI.getStructTypeOrNull()) { + unsigned Field = unsigned(cast(Idx)->getZExtValue()); + if (Field) { + uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + Value *OffsetVal = Builder->getInt(APInt(PtrMathSizeInBits, Offset)); + PointerValue = Builder->CreateAdd(PointerValue, OffsetVal); + } + Ty = StTy->getElementType(Field); + } else { + Ty = GTI.getIndexedType(); + if (const ConstantInt *CI = dyn_cast(Idx)) { + if (!CI->isZero()) { + uint64_t Offset = DL->getTypeAllocSize(Ty) * CI->getSExtValue(); + Value *OffsetVal = Builder->getInt(APInt(PtrMathSizeInBits, Offset)); + PointerValue = Builder->CreateAdd(PointerValue, OffsetVal); + } + } else { + Value *NewIdx = getSExtOrTrunc(Idx, PtrMathTy); + APInt ElementSize = APInt(PtrMathSizeInBits, DL->getTypeAllocSize(Ty)); + + if (BinaryOperator *BO = dyn_cast(NewIdx)) { + // Detect the pattern GEP base, a + b where base and a are both loop + // invariant (but not b), so we could rearrange the lowered code into + // (base + (a << shftAmt)) + (b << shftAmt). + Loop *L = LI ? LI->getLoopFor(BO->getParent()) : nullptr; + if (L && L->isLoopInvariant(PtrOp) && + BO->getOpcode() == Instruction::Add) { + + auto reassociate = [&](Value *A, Value *B) { + Value *InvVal = nullptr; + if (ElementSize == 1) + InvVal = A; + else if (ElementSize.isPowerOf2()) + InvVal = Builder->CreateShl( + A, APInt(PtrMathSizeInBits, ElementSize.logBase2())); + else + InvVal = Builder->CreateMul(A, Builder->getInt(ElementSize)); + PointerValue = Builder->CreateAdd(PointerValue, InvVal); + NewIdx = B; + }; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + bool isLHSLI = L->isLoopInvariant(LHS); + bool isRHSLI = L->isLoopInvariant(RHS); + if (isLHSLI && !isRHSLI) + reassociate(LHS, RHS); + else if (!isLHSLI && isRHSLI) + reassociate(RHS, LHS); + } + } + if (ElementSize == 1) { + // DO NOTHING. 
+ } else if (ElementSize.isPowerOf2()) { + APInt ShiftAmount = APInt(PtrMathSizeInBits, ElementSize.logBase2()); + NewIdx = Builder->CreateShl(NewIdx, ShiftAmount); + } else + NewIdx = Builder->CreateMul(NewIdx, Builder->getInt(ElementSize)); + + PointerValue = Builder->CreateAdd(PointerValue, NewIdx); + } + } + } + + PointerValue = Builder->CreateIntToPtr(PointerValue, GEP->getType()); + GEP->replaceAllUsesWith(PointerValue); + GEP->eraseFromParent(); + if (Instruction *I = dyn_cast(PointerValue)) { + BBI = BasicBlock::iterator(I); + ++BBI; + } + + return true; +} + +Value *GenXGEPLowering::getSExtOrTrunc(Value *Val, Type *NewTy) const { + assert(Builder); + Type *OldTy = Val->getType(); + unsigned OldWidth = OldTy->getIntegerBitWidth(); + unsigned NewWidth = NewTy->getIntegerBitWidth(); + + if (OldWidth < NewWidth) // SExt + return Builder->CreateSExt(Val, NewTy); + if (OldWidth > NewWidth) // Trunc + return truncExpr(Val, NewTy); + return Val; +} + +Value *GenXGEPLowering::truncExpr(Value *Val, Type *NewTy) const { + assert(Builder); + // Truncation on Gen could be as cheap as NOP by creating proper regions. + // Instead of truncating the value itself, truncate how it's calculated. + if (Constant *C = dyn_cast(Val)) + return Builder->CreateIntCast(C, NewTy, false); + + if (!isa(Val)) + return Builder->CreateTrunc(Val, NewTy); + + Instruction *I = cast(Val); + unsigned Opc = I->getOpcode(); + switch (Opc) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast(I); + Value *LHS = truncExpr(BO->getOperand(0), NewTy); + Value *RHS = truncExpr(BO->getOperand(1), NewTy); + return Builder->CreateBinOp(BO->getOpcode(), LHS, RHS); + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: { + Value *Opnd = I->getOperand(0); + if (Opnd->getType() == NewTy) + return Opnd; + return Builder->CreateIntCast(Opnd, NewTy, Opc == Instruction::SExt); + } + case Instruction::Select: { + Value *TVal = truncExpr(I->getOperand(1), NewTy); + Value *FVal = truncExpr(I->getOperand(2), NewTy); + return Builder->CreateSelect(I->getOperand(0), TVal, FVal); + } +#if 0 + // TODO: Rewrite truncExpr into iterative one instead of recursive one to + // easily found the loop due to phi-node. + case Instruction::PHI: { + PHINode *PN = cast(I); + PHINode *Res = PHINode::Create(NewTy, PN->getNumIncomingValues()); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = truncExpr(PN->getIncomingValue(i), NewTy); + Res->addIncoming(V, PN->getIncomingBlock(i)); + } + return Res; + } +#endif + default: + // Don't know truncate its calculation safely, fall back to the regular way. 
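+    // (Illustrative: truncating "add i64 (sext i32 %a to i64), 16" to i32 is
+    // handled by the cases above and rebuilt as "add i32 %a, 16"; only
+    // expressions not matched above take this fallback trunc.)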
+ break; + } + + return Builder->CreateTrunc(Val, NewTy); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp new file mode 100644 index 000000000000..388b3297a293 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp @@ -0,0 +1,332 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Utility functions relating to SIMD CF goto/join. +// +//===----------------------------------------------------------------------===// +#include "GenXGotoJoin.h" +#include "GenX.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * isEMValue : detect whether a value is an EM (execution mask) + * + * It is an EM value if it is an extractvalue instruction extracting element + * 0 from the struct returned by goto/join. + */ +bool GotoJoin::isEMValue(Value *V) +{ + if (auto EI = dyn_cast(V)) { + if (EI->getIndices()[0] == 0/* element number of EM in goto/join struct */) { + switch (GenXIntrinsic::getGenXIntrinsicID(EI->getAggregateOperand())) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + return true; + default: + break; + } + } + } + return false; +} + +/*********************************************************************** + * findJoin : given a goto, find the join whose RM it modifies + * + * Return: the join instruction, 0 if join not found + */ +CallInst *GotoJoin::findJoin(CallInst *Goto) +{ + // Find the RM value from the goto. We know that the only + // uses of the goto are extracts. + ExtractValueInst *RM = nullptr; + for (auto ui = Goto->use_begin(), ue = Goto->use_end(); ui != ue; ++ui) { + auto Extract = dyn_cast(ui->getUser()); + if (Extract && Extract->getIndices()[0] == 1/* RM index in struct */) { + RM = Extract; + break; + } + } + if (!RM) + return nullptr; + // Find the single use of the RM in a join, possibly via phi nodes and + // other goto instructions. 
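+  // Illustrative shape of the chain being followed (not literal IR):
+  //   %g0  = goto(..., %RM, ...)        ; the goto we started from
+  //   %rm0 = extractvalue %g0, 1        ; its updated RM
+  //   %g1  = goto(..., %rm0, ...)       ; possibly more gotos and/or phis
+  //   %rm1 = extractvalue %g1, 1
+  //   %j   = join(..., %rm1, ...)       ; the join being searched for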
+ CallInst *Join = nullptr; + SetVector RMVals; + RMVals.insert(RM); + for (unsigned ri = 0; !Join && ri != RMVals.size(); ++ri) { + auto RM = RMVals[ri]; + for (auto ui = RM->use_begin(), ue = RM->use_end(); + !Join && ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + RMVals.insert(User); + else switch (GenXIntrinsic::getGenXIntrinsicID(User)) { + case GenXIntrinsic::genx_simdcf_join: + // We have found the join the RM is for. + Join = cast(User); + break; + case GenXIntrinsic::genx_simdcf_goto: { + // This is another goto that modifies the same RM. Find the + // extractvalue for the updated RM value. + ExtractValueInst *Extract = nullptr; + for (auto gui = User->use_begin(), gue = User->use_end(); + gui != gue; ++gui) { + auto ThisExtract = dyn_cast(gui->getUser()); + if (ThisExtract + && ThisExtract->getIndices()[0] == 1/*RM index in struct*/) { + Extract = ThisExtract; + break; + } + } + if (Extract) + RMVals.insert(Extract); + break; + } + default: + return nullptr; // unexpected use of RM + } + } + } + return Join; +} + +/*********************************************************************** + * isValidJoin : check that a join is valid + * + * In a block that is a join label (the "true" successor of a goto/join), there + * must be a join at the start of the block, ignoring phi nodes and bitcasts + * (which generate no code). + * + */ +bool GotoJoin::isValidJoin(CallInst *Join) +{ + assert(GenXIntrinsic::getGenXIntrinsicID(Join) == GenXIntrinsic::genx_simdcf_join); + auto BB = Join->getParent(); + // If this block has a goto/join predecessor of which it is "true" successor, + // check that this block starts with a join -- not necessarily the join we + // were given. + if (!isJoinLabel(BB)) + return true; + auto Inst = BB->getFirstNonPHIOrDbg(); + while (isa(Inst)) + Inst = Inst->getNextNode(); + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == GenXIntrinsic::genx_simdcf_join) + return true; + return false; +} + +/*********************************************************************** + * isBranchingJoinLabelBlock : check whether a block has a single join and + * is both a join label and a branching join + * + * This only works after GenXLateSimdCFConformance. + * + * For a block for which this returns true, a pass must not insert code. + */ +bool GotoJoin::isBranchingJoinLabelBlock(BasicBlock *BB) +{ + auto Join = isBranchingJoinBlock(BB); + if (!Join || Join != BB->getFirstNonPHIOrDbg()) + return false; + return isJoinLabel(BB); +} + +/*********************************************************************** + * getBranchingBlockForBB : if this block is "true" successor of branching + * goto/join then return this branching block. Otherwise return nullptr. + * + * Enter: BB = the basic block + * SkipCriticalEdgeSplitter = if true, skip a critical edge splitter + * block when trying to find a branching goto/join + * + * SkipCriticalEdgeSplitter only needs to be set when used from inside + * GenXSimdCFConformance, before it has removed critical edge splitter blocks + * that separate a branching goto/join and the join label. + * + * "true" successor of branching block has to be a join label if it is not + * empty. This function does not test that. 
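+ *
+ * Illustrative CFG shape (block and value names invented for the sketch):
+ *
+ *   PredBB:                      ; branching goto/join block
+ *     %g = goto/join ...
+ *     %c = extractvalue ...      ; scalar branch condition from %g
+ *     br i1 %c, label %BB, label %Fallthru
+ *
+ *   BB:                          ; "true" successor, i.e. the join label
+ *     %j = join ...              ; must start with a join (see isValidJoin)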
+ * + */ +BasicBlock *GotoJoin::getBranchingBlockForBB(BasicBlock *BB, + bool SkipCriticalEdgeSplitter) { + for (auto ui = BB->use_begin(), ue = BB->use_end(); ui != ue; ++ui) { + auto PredBr = dyn_cast(ui->getUser()); + if (!PredBr || ui->getOperandNo() != PredBr->getNumOperands() - 1) + continue; + // PredBr is a branch that has BB as its "true" successor. First skip a + // critical edge splitter. + auto PredBB = PredBr->getParent(); + if (SkipCriticalEdgeSplitter && PredBr->getNumSuccessors() == 1 + && PredBr == PredBB->getFirstNonPHIOrDbg() && PredBB->hasOneUse()) { + auto ui2 = PredBB->use_begin(); + PredBr = dyn_cast(ui2->getUser()); + if (!PredBr || ui2->getOperandNo() != PredBr->getNumOperands() - 1) + continue; + PredBB = PredBr->getParent(); + } + // Check to see if it is a goto/join. + if (isBranchingGotoJoinBlock(PredBB)) + return PredBB; + } + return nullptr; +} + +/*********************************************************************** + * isJoinLabel : check whether this block needs to be a join label, because + * it is the "true" successor of at least one goto/join branch + * + * See getBranchingBlockForBB for details. + * + */ +bool GotoJoin::isJoinLabel(BasicBlock *BB, bool SkipCriticalEdgeSplitter) { + return getBranchingBlockForBB(BB, SkipCriticalEdgeSplitter); +} + +/*********************************************************************** + * isGotoBlock : see if a basic block is a goto block (hence branching), + * returning the goto if so + * + * See the comment at the top of isBranchingGotoJoinBlock regarding the case + * of a goto with an unconditional branch. + */ +CallInst *GotoJoin::isGotoBlock(BasicBlock *BB) +{ + auto Goto = isBranchingGotoJoinBlock(BB); + if (GenXIntrinsic::getGenXIntrinsicID(Goto) != GenXIntrinsic::genx_simdcf_goto) + Goto = nullptr; + return Goto; +} + +/*********************************************************************** + * isBranchingJoinBlock : see if a basic block is a branching + * join block, returning the join if so + */ +CallInst *GotoJoin::isBranchingJoinBlock(BasicBlock *BB) +{ + auto Join = isBranchingGotoJoinBlock(BB); + if (GenXIntrinsic::getGenXIntrinsicID(Join) != GenXIntrinsic::genx_simdcf_join) + Join = nullptr; + return Join; +} + +/*********************************************************************** + * isBranchingGotoJoinBlock : see if a basic block is a branching + * goto/join block, returning the goto/join if so + * + * This includes the case of a goto with an unconditional branch, as long as + * this is after GenXLateSimdCFConformance (or during GenX*SimdCFConformance + * after it has run moveCodeInGotoBlocks), because it relies on + * moveCodeInGotoBlocks having sunk the goto and its extracts to the end of the + * block. + */ +CallInst *GotoJoin::isBranchingGotoJoinBlock(BasicBlock *BB) +{ + auto Br = dyn_cast(BB->getTerminator()); + if (!Br) + return nullptr; + if (!Br->isConditional()) { + // Unconditional branch. Check for the block ending with a goto or an + // extract from a goto. + if (Br == &BB->front()) + return nullptr; + Value *LastInst = Br->getPrevNode(); + if (auto EV = dyn_cast(LastInst)) + LastInst = EV->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(LastInst) == GenXIntrinsic::genx_simdcf_goto) + return cast(LastInst); + return nullptr; + } + // Conditional branch. Check for the condition being an extractvalue from a + // goto/join. 
+ auto EV = dyn_cast(Br->getCondition()); + if (!EV) + return nullptr; + auto GotoJoin = dyn_cast(EV->getOperand(0)); + if (!GotoJoin || GotoJoin->getParent() != BB) + return nullptr; + switch (GenXIntrinsic::getGenXIntrinsicID(GotoJoin)) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + return GotoJoin; + default: + break; + } + return nullptr; +} + +/*********************************************************************** + * getLegalInsertionPoint : ensure an insertion point is legal in the presence + * of SIMD CF + * + * This is used by a pass that inserts or moves code after + * GenXLateSimdCFConformance. + * + * A branching join label block is not allowed any other code. If the insertion + * point is in one of those, move up to its immediate dominator. + * + * A goto or branching join is not allowed code after the goto/join. If the + * insertion point is there, move to just before the goto/join. + */ +Instruction *GotoJoin::getLegalInsertionPoint(Instruction *InsertBefore, + DominatorTree *DomTree) +{ + auto *InsertPoint = InsertBefore; + auto *InsertBB = InsertBefore->getParent(); + while (isBranchingJoinLabelBlock(InsertBB)) { + auto Node = DomTree->getNode(InsertBB); + assert(Node); + auto IDom = Node->getIDom(); + assert(IDom); + InsertBB = IDom->getBlock(); + InsertPoint = InsertBB->getTerminator(); + } + if (auto GotoJoin = isBranchingGotoJoinBlock(InsertBB)) + InsertPoint = GotoJoin; + + if (InsertBB == InsertBefore->getParent()) { + // If this is the same BB check that our InsertPoint + // goes before than InsertBefore + auto *TermInst = InsertBB->getTerminator(); + Instruction *t = InsertPoint; + while (t != InsertBefore) { + if (t == TermInst) { + InsertPoint = InsertBefore; + break; + } + t = t->getNextNode(); + } + } + return InsertPoint; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h new file mode 100644 index 000000000000..6fd8535376ce --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h @@ -0,0 +1,83 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef TARGET_GENXGOTOJOIN_H +#define TARGET_GENXGOTOJOIN_H + +namespace llvm { + +class BasicBlock; +class CallInst; +class DominatorTree; +class Instruction; +class Value; + +namespace genx { + +// GotoJoin : class containing goto/join related utility functions +class GotoJoin { +public: + + // isEMValue : detect whether a value is an EM (execution mask) + static bool isEMValue(Value *V); + + // findJoin : given a goto, find the join whose RM it modifies + static CallInst *findJoin(CallInst *Goto); + + // isValidJoin : check that the block containing a join is valid + static bool isValidJoin(CallInst *Join); + + // isBranchingJoinLabelBlock : check whether a block has a single join and + // is both a join label and a branching join + static bool isBranchingJoinLabelBlock(BasicBlock *BB); + + // getBranchingBlockForJoinLabel : if BB is "true" successor of branching + // block, return this branching block. If SkipCriticalEdgeSplitter is set, + // empty critical edge splitter blocks are skipped. + static BasicBlock *getBranchingBlockForBB(BasicBlock *BB, + bool SkipCriticalEdgeSplitter); + + // isJoinLabel : see if the block is a join label + static bool isJoinLabel(BasicBlock *BB, bool SkipCriticalEdgeSplitter = false); + + // isGotoBlock : see if a basic block is a goto block (hence branching), returning the goto if so + static CallInst *isGotoBlock(BasicBlock *BB); + + // isBranchingJoinBlock : see if a basic block is a branching join block + static CallInst *isBranchingJoinBlock(BasicBlock *BB); + + // isBranchingGotoJoinBlock : see if a basic block is a branching goto/join block + static CallInst *isBranchingGotoJoinBlock(BasicBlock *BB); + + // getLegalInsertionPoint : ensure an insertion point is legal in the presence of SIMD CF + static Instruction *getLegalInsertionPoint(Instruction *InsertBefore, DominatorTree *DomTree); + +}; + +} // End genx namespace +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp new file mode 100644 index 000000000000..a12c3b5d1a21 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp @@ -0,0 +1,390 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXIMadLegalization +/// -------------------- +/// +/// This pass performs the legalization on integer mad to ensure additive +/// operand is alway single-used so that it could be mapped to accumulator +/// register. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_IMAD_POST_LEGALIZATION" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXIMadPostLegalization : public FunctionPass { + DominatorTree *DT; + GenXBaling *Baling; +public: + static char ID; + + explicit GenXIMadPostLegalization() : + FunctionPass(ID), DT(nullptr), Baling(nullptr) {} + + StringRef getPassName() const override { + return "GenX IMAD post-legalization pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } + + bool runOnFunction(Function &F) override; + +protected: + bool fixMadChain(BasicBlock *); +}; + +} // end anonymous namespace + +char GenXIMadPostLegalization::ID = 0; + +namespace llvm { +void initializeGenXIMadPostLegalizationPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXIMadPostLegalization, "GenXIMadLegalization", "GenXIMadLegalization", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling) +INITIALIZE_PASS_END(GenXIMadPostLegalization, "GenXIMadLegalization", "GenXIMadLegalization", false, false) + +FunctionPass *llvm::createGenXIMadPostLegalizationPass() { + initializeGenXIMadPostLegalizationPass(*PassRegistry::getPassRegistry()); + return new GenXIMadPostLegalization(); +} + +static bool isIntegerMadIntrinsic(Value *V) { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + default: break; + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_sumad: + case GenXIntrinsic::genx_usmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_ssmad_sat: + case GenXIntrinsic::genx_sumad_sat: + case GenXIntrinsic::genx_usmad_sat: + case GenXIntrinsic::genx_uumad_sat: + return true; + } + return false; +} + +static bool isIntegerMulIntrinsic(Value *V) { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + default: break; + case GenXIntrinsic::genx_ssmul: + case GenXIntrinsic::genx_sumul: + case GenXIntrinsic::genx_usmul: + case GenXIntrinsic::genx_uumul: + return true; + } + return false; +} + +static std::tuple +findNearestInsertPt(DominatorTree *DT, ArrayRef Users) { + DenseMap BBs; + for (auto U : Users) { + auto UseBB = U->getParent(); + auto MI = BBs.end(); + bool New = false; + std::tie(MI, New) = BBs.insert(std::make_pair(UseBB, U)); + if (New) + continue; + // Find the earliest user if more than one users are in the same block. 
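+      // (If both the cached user and U live in UseBB, the scan below keeps
+      // whichever of the two appears first from the start of the block.)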
+ auto BI = UseBB->begin(); + for (; &*BI != U && &*BI != MI->second; ++BI) + /* EMPTY */; + MI->second = &*BI; + } + + assert(BBs.size() != 0 && "At least one BB should be found!"); + + auto MI = BBs.begin(); + if (BBs.size() == 1) + return std::make_tuple(MI->first, MI->second); + + auto BB = MI->first; + auto ME = BBs.end(); + for (++MI; MI != ME; ++MI) + BB = DT->findNearestCommonDominator(BB, MI->first); + + MI = BBs.find(BB); + if (MI != BBs.end()) + return std::make_tuple(MI->first, MI->second); + + return std::make_tuple(BB, nullptr); +} + +bool GenXIMadPostLegalization::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + Baling = &getAnalysis(); + bool Changed = false; + + // After this point, we should not do constant folding. + Changed |= breakConstantExprs(&F); + + // The following alorithm runs very slowly on large blocks. + if (skipOptWithLargeBlock(F)) + return Changed; + + SmallVector Deads; + for (auto &BB : F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /* EMPTY */) { + Instruction *I = &*BI++; + if (!isIntegerMadIntrinsic(I)) + continue; + auto II = cast(I); + // Check src2 and duplicate if necessary. + Value *S2 = II->getOperand(2); + if (S2->hasOneUse()) { + // Sink S2 closer to user to shorten acc live ranges. + // This is particular important when 32 bit integer multiplications + // are not native and acc registers will be used to emulate them. + auto I2 = dyn_cast(S2); + if (I2 == nullptr || I2->getParent() != I->getParent()) + continue; + if (I2->mayHaveSideEffects() || isa(I2) || + I2->getNextNode() == I) + continue; + I2->moveBefore(I); + Changed = true; + continue; + } + // Only duplicate on selective instructions. + if (!GenXIntrinsic::isRdRegion(S2) && !isIntegerMulIntrinsic(S2)) + continue; + Instruction *RII = cast(S2); + SmallVector Others; + for (auto UI = S2->use_begin(), + UE = S2->use_end(); UI != UE; /* EMPTY */) { + Use &U = *UI++; + auto InsertPt = cast(U.getUser()); + if (!isIntegerMadIntrinsic(InsertPt) || U.getOperandNo() != 2) { + Others.push_back(InsertPt); + continue; + } + auto NewInst = RII->clone(); + NewInst->setName(RII->getName() + ".postimad"); + NewInst->insertBefore(InsertPt); + U.set(NewInst); + } + if (!Others.empty()) { + // Find a new place for RII. + BasicBlock *NBB = nullptr; + Instruction *Pt = nullptr; + std::tie(NBB, Pt) = findNearestInsertPt(DT, Others); + Pt = Pt ? Pt : NBB->getTerminator(); + RII->moveBefore(Pt); + } else + Deads.push_back(RII); + Changed = true; + } + } + for (auto I : Deads) + I->eraseFromParent(); + + for (auto &BB : F) + Changed |= fixMadChain(&BB); + + + return Changed; +} + +bool GenXIMadPostLegalization::fixMadChain(BasicBlock *BB) { + + // Given the bale 'B', collect all its operand instructions in the same basic + // block. + auto collectUnbaledOpndInsts = [](BasicBlock *BB, Bale &B) { + std::vector Opnds; + Instruction *In = nullptr; + // Collect operand instructions not baled yet. + for (auto I = B.begin(), E = B.end(); I != E; ++I) { + bool isFMA = GenXIntrinsic::getAnyIntrinsicID(I->Inst) == Intrinsic::fma; + for (unsigned i = 0, e = I->Inst->getNumOperands(); i != e; ++i) { + // Skip if that operand is baled. + if (I->Info.isOperandBaled(i)) + continue; + auto Op = dyn_cast(I->Inst->getOperand(i)); + // Skip if it's not an instruction or from the same BB. + if (Op && Op->getParent() == BB) { + Opnds.push_back(Op); + if (isFMA && i == 2) + In = Op; + } + } + // Bail out once 'maininst' is processed. 
The 'maininst' is usually baled + // in 'wrregion', 'sat' and similar stuffs, which usually doesn't require + // additional operands. + if (I->Info.Type == BaleInfo::MAININST) + break; + } + return std::make_pair(In, Opnds); + }; + + // Given two instructions, 'A' and 'B', in the same basic block, check + // whether 'A' dominates 'B'. + auto dominates = [](const Instruction *A, const Instruction *B) { + const BasicBlock *BB = A->getParent(); + assert(BB == B->getParent()); + + BasicBlock::const_iterator BI = BB->begin(); + for (; &*BI != A && &*BI != B; ++BI) + /*EMPTY*/; + + return &*BI == A; + }; + + bool Changed = false; + std::set FMAs; // 'fma' already handled. + for (auto BI = BB->rbegin(), BE = BB->rend(); BI != BE; ++BI) { + auto Inst = &*BI; + Bale OutB; + Baling->buildBale(Inst, &OutB); + // Skip bale non-FMA bale. + if (!OutB.getMainInst()) + continue; + auto CandidateInsn = OutB.getMainInst()->Inst; + assert(CandidateInsn); + if (GenXIntrinsic::getAnyIntrinsicID(CandidateInsn) != Intrinsic::fma) + continue; + // Skip if it's already handled. + if (FMAs.count(CandidateInsn)) + continue; + + // Collection of all inputs for the chain curently discovered. + std::set Inputs; + // The mad chain itself. + std::vector Chain; + Chain.push_back(OutB); + FMAs.insert(CandidateInsn); + do { + auto &OutB = Chain.back(); + Instruction *In = nullptr; + std::vector Opnds; + // Collect all operands so that we could grow the chain through the + // chain-in. + std::tie(In, Opnds) = collectUnbaledOpndInsts(BB, OutB); + if (!In || !In->hasOneUse()) + break; + // Check whether all inputs collected so far dominates 'In' so that we + // won't add extra register pressure. + for (auto &I : Inputs) { + if (dominates(I, In)) + continue; + In = nullptr; + break; + } + // Skip chain building if there are inputs won't be dominated by the new + // chain-in. + if (!In) + break; + // Check inputs from the tip of chain, i.e. the current chain-out. + for (auto &OpI : Opnds) { + // Skip the chain-in. + if (OpI == In) + continue; + // Skip if that input dominates the chain-in but record it as inputs. + // + // FIXME: revisit the following check. This stops sinking non-mad bales + // which may increase register pressure and inserts non-mad instructions + // among mads. + if (true || !OpI->hasOneUse() || dominates(OpI, In)) { + Inputs.insert(OpI); + continue; + } + // TODO: So far, only traverse one step further from that chain-out + // operands. + Bale OpB; + Baling->buildBale(OpI, &OpB); + std::vector SubOpnds; + std::tie(std::ignore, SubOpnds) = collectUnbaledOpndInsts(BB, OpB); + for (auto &SubI : SubOpnds) { + if (dominates(SubI, In)) { + Inputs.insert(SubI); + continue; + } + // Stop chaining as 'SubI' intervenes between 'In' and 'Out'. + In = nullptr; + break; + } + if (!In) + break; + Chain.push_back(OpB); + } + if (!In) + break; + // Grow the chain by appending this chain-in. + Bale InB; + Baling->buildBale(In, &InB); + Chain.push_back(InB); + // Stop chaining if it's not mad any more. + if (!InB.getMainInst()) + break; + auto CandidateInst = InB.getMainInst()->Inst; + assert(CandidateInst); + if (GenXIntrinsic::getAnyIntrinsicID(CandidateInst) != Intrinsic::fma) + break; + FMAs.insert(CandidateInst); + } while (1); + // Cluster the discovered chain together. 
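+    // The chain was collected from the chain-out backwards, so moving each
+    // bale's instructions in front of the previously placed ones packs the
+    // FMAs in the chain next to each other (keeping accumulator candidates
+    // adjacent); phis are left in place.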
+ if (FMAs.size() > 1) { + Instruction *Pos = nullptr; + for (auto I = Chain.begin(), E = Chain.end(); I != E; ++I) { + for (auto II = I->rbegin(), IE = I->rend(); II != IE; ++II) { + if (!Pos) { + Pos = II->Inst; + continue; + } + // Skip phi which is not movable. + if (isa(II->Inst)) + break; + II->Inst->moveBefore(Pos); + Pos = II->Inst; + Changed = true; + } + } + } + } + return Changed; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp new file mode 100644 index 000000000000..22cd57501ee5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp @@ -0,0 +1,345 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXInlineAsmLowering +/// ------------ +/// This pass recreates VISA inline assembly with new types +/// if 'cr' constraint is used. Also pass inserts constraints +/// information as metadata in order not to parse constraints +/// string every time in each pass where this information is needed. 
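+/// Each parsed constraint is recorded as one metadata entry holding the
+/// constraint type, the matching-input index and an is-output flag (see
+/// createInlineAsmMetadata below). The resulting tuple is attached to the
+/// call under the genx::MD_genx_inline_asm_info kind; an illustrative
+/// rendering of one entry (not taken from real compiler output) is:
+///
+///   !{i32 <constraint type>, i32 <matching input>, i1 <is output>}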
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "visa_igc_common_header.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXInlineAsmLowering : public FunctionPass { + using ConstraintInfoVector = InlineAsm::ConstraintInfoVector; + using ConstraintInfo = InlineAsm::ConstraintInfo; + using GenXConstraintInfoVector = std::vector; + +private: + LLVMContext *Context = nullptr; + SmallVector ToErase; + + MDNode *createInlineAsmMetadata( + CallInst *CI, + const InlineAsm::ConstraintInfoVector &ConstraintsInfo) const; + + Type *rewriteTypeForConstraintIfNeeded( + Type *Ty, const GenXInlineAsmInfo &ConstraintInfo) const; + Type *rewriteTypeForCR(Type *CRType) const; + + FunctionType *rewriteFunctionTypeForInlineAsmIfNeeded( + CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) const; + + void replaceInlineAsmUses(CallInst *Of, CallInst *With, + const GenXConstraintInfoVector &ConstraintsInfo); + + CallInst * + recreateInlineAsmWithCR(CallInst *CI, + const GenXConstraintInfoVector &ConstraintsInfo); + +public: + static char ID; + explicit GenXInlineAsmLowering() : FunctionPass(ID) {} + StringRef getPassName() const override { + return "GenX VISA inline asm lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} // end namespace + +char GenXInlineAsmLowering::ID = 0; +namespace llvm { +void initializeGenXInlineAsmLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXInlineAsmLowering, "GenXInlineAsmLowering", + "GenXInlineAsmLowering", false, false) +INITIALIZE_PASS_END(GenXInlineAsmLowering, "GenXInlineAsmLowering", + "GenXInlineAsmLowering", false, false) + +FunctionPass *llvm::createGenXInlineAsmLoweringPass() { + initializeGenXInlineAsmLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXInlineAsmLowering; +} + +void GenXInlineAsmLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); +} + +bool GenXInlineAsmLowering::runOnFunction(Function &F) { + auto GM = getAnalysisIfAvailable(); + if (GM && !GM->HasInlineAsm()) + return false; + + // Collect inline asm worklist + auto InlineAsmsToProcess = + make_filter_range(instructions(&F), [&](Instruction &I) { + auto *CI = dyn_cast(&I); + // No need to process inline asm with empty constraint string + return CI && CI->isInlineAsm() && + !cast(CI->getCalledValue()) + ->getConstraintString() + .empty(); + }); + + SmallVector InlineAsms; + llvm::transform(InlineAsmsToProcess, std::back_inserter(InlineAsms), + [&](Instruction &I) { return cast(&I); }); + + if (InlineAsms.empty()) + return false; + + Context = &InlineAsms[0]->getContext(); + for (auto *CI : InlineAsms) { + auto *IA = 
cast(CI->getCalledValue()); + InlineAsm::ConstraintInfoVector ConstraintsInfo = IA->ParseConstraints(); + MDNode *ConstraintsMD = createInlineAsmMetadata(CI, ConstraintsInfo); + GenXConstraintInfoVector GenXConstraintsInfo = + genx::getGenXInlineAsmInfo(ConstraintsMD); + + // No need to recreate asm expression if + // there is no 'cr' constraint. Set created metadata and return. + if (!genx::hasConstraintOfType(GenXConstraintsInfo, + ConstraintType::Constraint_cr)) { + CI->setMetadata(genx::MD_genx_inline_asm_info, ConstraintsMD); + continue; + } + + // Create new inline asm and don't forget to set + // earlier created metadata. + CallInst *NewCI = recreateInlineAsmWithCR(CI, GenXConstraintsInfo); + NewCI->setMetadata(genx::MD_genx_inline_asm_info, ConstraintsMD); + } + + for (auto *I : ToErase) + I->eraseFromParent(); + ToErase.clear(); + + return true; +} + +// Creating metadata for inline asm constraints +// in order not to parse constraints string every time in each pass +// where this information is needed. +MDNode *GenXInlineAsmLowering::createInlineAsmMetadata( + CallInst *CI, const ConstraintInfoVector &ConstraintsInfo) const { + assert(!ConstraintsInfo.empty() && "Non empty constraints expected"); + assert(CI->isInlineAsm() && "Inline asm expected"); + + Type *Int1Ty = Type::getInt1Ty(*Context); + Type *Int32Ty = Type::getInt32Ty(*Context); + std::vector Entries; + for (auto &&Info : ConstraintsInfo) { + std::string Codes; + if (genx::isInlineAsmMatchingInputConstraint(Info)) + Codes = genx::getInlineAsmCodes( + ConstraintsInfo[genx::getInlineAsmMatchedOperand(Info)]); + else + Codes = genx::getInlineAsmCodes(Info); + + genx::ConstraintType CTy = genx::getInlineAsmConstraintType(Codes); + if (CTy == ConstraintType::Constraint_unknown) + Context->emitError(CI, "Unsupported constraint '" + Codes + + "' in inline assembly"); + + Metadata *EntryMD[3] = { + ConstantAsMetadata::get( + ConstantInt::get(Int32Ty, static_cast(CTy))), + ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Info.MatchingInput)), + ConstantAsMetadata::get(ConstantInt::get( + Int1Ty, (Info.Type == InlineAsm::ConstraintPrefix::isOutput)))}; + Entries.push_back(MDNode::get(*Context, EntryMD)); + } + return MDTuple::get(*Context, Entries); +} + +Type *GenXInlineAsmLowering::rewriteTypeForConstraintIfNeeded( + Type *Ty, const GenXInlineAsmInfo &ConstraintInfo) const { + switch (ConstraintInfo.getConstraintType()) { + default: + return Ty; + case ConstraintType::Constraint_cr: + return rewriteTypeForCR(Ty); + } +} + +Type *GenXInlineAsmLowering::rewriteTypeForCR(Type *CRType) const { + assert(CRType->isIntOrIntVectorTy() && + "Expected integer inputs for 'cr' constraint"); + Type *Int1Ty = Type::getInt1Ty(*Context); + return CRType->isVectorTy() + ? 
VectorType::get(Int1Ty, CRType->getVectorNumElements()) + : Int1Ty; +} + +// If there exist 'cr' for now output a new result type must be constructed +FunctionType *GenXInlineAsmLowering::rewriteFunctionTypeForInlineAsmIfNeeded( + CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) const { + // Rewriting return type + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + std::vector NewResultsTypes; + if (NumOutputs == 1) { + NewResultsTypes.push_back( + rewriteTypeForConstraintIfNeeded(CI->getType(), ConstraintsInfo[0])); + } else if (NumOutputs > 1) { + auto *ST = cast(CI->getType()); + std::transform(ST->element_begin(), ST->element_end(), + ConstraintsInfo.begin(), std::back_inserter(NewResultsTypes), + [&](Type *Ty, const GenXInlineAsmInfo &Info) { + return rewriteTypeForConstraintIfNeeded(Ty, Info); + }); + } + + // New return type: struct for multiple outputs, + // void for no outputs, and one exact type for single output + Type *NewRetType; + if (NewResultsTypes.empty()) + NewRetType = Type::getVoidTy(*Context); + else if (NewResultsTypes.size() == 1) + NewRetType = NewResultsTypes[0]; + else + NewRetType = StructType::get(*Context, NewResultsTypes); + + // Rewritng params types + std::vector NewParamsTypes; + std::transform(CI->arg_begin(), CI->arg_end(), + ConstraintsInfo.begin() + NumOutputs, + std::back_inserter(NewParamsTypes), + [&](Value *V, const GenXInlineAsmInfo &Info) { + return rewriteTypeForConstraintIfNeeded(V->getType(), Info); + }); + return FunctionType::get(NewRetType, NewParamsTypes, false); +} + +// If result type differs than iterate over all +// users of original call and replace it's +// uses with new outputs. Thus new extractelements and +// zero exstensions might be created. Existing extracts should be eliminated. +void GenXInlineAsmLowering::replaceInlineAsmUses( + CallInst *Of, CallInst *With, + const GenXConstraintInfoVector &ConstraintsInfo) { + if (Of->getType() == With->getType()) { + Of->replaceAllUsesWith(With); + return; + } + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(With->getNextNode()); + unsigned NumOutputs = genx::getInlineAsmNumOutputs(Of); + if (NumOutputs == 1) { + Value *NewResZExt = Builder.CreateZExt(With, Of->getType(), ".asm.zext.cr"); + Of->replaceAllUsesWith(NewResZExt); + return; + } + + // Create new extractvalues and replace all uses + for (auto *U : Of->users()) { + Value *ToZext = With; + auto *EV = cast(U); + ToErase.push_back(EV); + unsigned OutputConstraintIdx = EV->getIndices()[0]; + ToZext = + Builder.CreateExtractValue(ToZext, OutputConstraintIdx, "asmresult.cr"); + // Zero extension needed only for 'cr' output + if (ConstraintsInfo[OutputConstraintIdx].getConstraintType() == + genx::ConstraintType::Constraint_cr) + ToZext = Builder.CreateZExt(ToZext, U->getType(), ".asmresult.zext.cr"); + U->replaceAllUsesWith(ToZext); + } +} + +// If inline assembly uses 'cr' constraints (for now) +// all types should be converted to i1. So inserting +// truncations for inputs and zero extensions for outputs. 
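+//
+// A minimal sketch of the intended rewrite for a single 'cr' input and
+// output (the asm string is hypothetical; the value-name suffixes match the
+// ones used in the code below):
+//
+//   %r = call i32 asm "...", "=cr,cr"(i32 %mask)
+//
+// becomes
+//
+//   %mask.trunc.cr = trunc i32 %mask to i1
+//   %r.asm.cr = call i1 asm "...", "=cr,cr"(i1 %mask.trunc.cr)
+//   %r.asm.zext.cr = zext i1 %r.asm.cr to i32
+//
+// and all uses of the original call are rewired to the zero-extended value.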
+CallInst *GenXInlineAsmLowering::recreateInlineAsmWithCR(
+    CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) {
+  assert(!ConstraintsInfo.empty() && "Non-empty constraints expected");
+  assert(CI->isInlineAsm() && "Inline asm expected");
+
+  // If a 'cr' output exists, a new result type must be constructed.
+  FunctionType *NewFTy =
+      rewriteFunctionTypeForInlineAsmIfNeeded(CI, ConstraintsInfo);
+
+  // New types for 'cr':
+  // any_int -> i1
+  // <N x any_int> -> <N x i1>
+  //
+  // Create truncation for input args if needed.
+  IRBuilder<> Builder(CI);
+  std::vector<Value *> NewArgs;
+  std::transform(CI->arg_begin(), CI->arg_end(), NewFTy->param_begin(),
+                 std::back_inserter(NewArgs), [&](Value *Arg, Type *NewArgTy) {
+                   if (Arg->getType() != NewArgTy)
+                     Arg = Builder.CreateTrunc(Arg, NewArgTy, ".trunc.cr");
+                   return Arg;
+                 });
+
+  // Create exactly the same inline assembly but with the new function type.
+  auto *IA = cast<InlineAsm>(CI->getCalledValue());
+  InlineAsm *NewIA = InlineAsm::get(
+      NewFTy, IA->getAsmString(), IA->getConstraintString(),
+      IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect());
+  CallInst *NewCI = Builder.CreateCall(NewIA, NewArgs, ".asm.cr");
+  NewCI->setAttributes(CI->getAttributes());
+  NewCI->setDebugLoc(CI->getDebugLoc());
+
+  replaceInlineAsmUses(CI, NewCI, ConstraintsInfo);
+  ToErase.push_back(CI);
+
+  return NewCI;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp
new file mode 100644
index 000000000000..a3d179cdaba3
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp
@@ -0,0 +1,141 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXInstCombineCleanupPass
+/// --------------------------
+///
+/// For switch instructions, llvm 7.0 instcombine aggressively shrinks the type
+/// of the condition variable. This can introduce types which are unsupported
+/// in GenX IR (like i2, i27, etc.).
+/// The pass tries to detect such switch instructions and modify them to use
+/// the original condition instead of a truncated one.
+/// The idea is to do it using standard llvm passes, so we just try to do the
+/// opposite of the instcombine change and expect irbuilder folding or other
+/// passes to restore the code as it was before.
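+///
+/// For example (illustrative only, following the i27 case mentioned above),
+/// instcombine may have shrunk a switch to
+///
+///   %c = trunc i32 %x to i27
+///   switch i27 %c, label %def [ i27 1, label %bb1 ]
+///
+/// and this pass widens it back to a legal integer type, sign-extending the
+/// condition and the case values:
+///
+///   %c.wide = sext i27 %c to i32
+///   switch i32 %c.wide, label %def [ i32 1, label %bb1 ]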
+ +#define DEBUG_TYPE "GENX_INSTCOMBCLEANUP" + +#include "GenX.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXInstCombineCleanup : public FunctionPass { +public: + static char ID; + + explicit GenXInstCombineCleanup() : FunctionPass(ID) { } + + StringRef getPassName() const override { return "GenX InstCombineCleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char GenXInstCombineCleanup::ID = 0; +namespace llvm { void initializeGenXInstCombineCleanupPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXInstCombineCleanup, "GenXInstCombineCleanup", "GenXInstCombineCleanup", false, false) +INITIALIZE_PASS_END(GenXInstCombineCleanup, "GenXInstCombineCleanup", "GenXInstCombineCleanup", false, false) + +FunctionPass *llvm::createGenXInstCombineCleanup() +{ + initializeGenXInstCombineCleanupPass(*PassRegistry::getPassRegistry()); + return new GenXInstCombineCleanup(); +} + +void GenXInstCombineCleanup::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + +bool typeMustBeChanged(Type *Ty) { + assert(Ty); + if (!Ty->isIntegerTy()) + return false; + unsigned Size = Ty->getPrimitiveSizeInBits(); + // Possible sizes are 1, 8, 16, 32, ... (2 and 4 must be excluded) + if (isPowerOf2_32(Size) && !(genx::BoolBits < Size && Size < genx::ByteBits)) + return false; + return true; +} + +bool GenXInstCombineCleanup::runOnFunction(Function &F) +{ + bool Modified = false; + +#if (LLVM_VERSION_MAJOR <= 7) + LLVM_DEBUG(dbgs() << "running GenXInstCombineCleanup on " << F.getName() << "\n"); + + LLVMContext &Ctx = F.getContext(); + IRBuilder<> Builder(Ctx); + + for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) { + auto Switch = dyn_cast(&*I); + if (!Switch) + continue; + + auto Cond = Switch->getCondition(); + Type *CondTy = Cond->getType(); + if (!typeMustBeChanged(CondTy)) + continue; + + unsigned CondSize = CondTy->getPrimitiveSizeInBits(); + assert(CondSize != genx::BoolBits && + "CondSize == 1 is not expected here. See typeMustBeChanged"); + // Round up to the next power of 2 skipping i2 and i4 (i3 -> i8, i2 -> i8, + // etc) + unsigned Size = + CondSize < genx::ByteBits ? 
genx::ByteBits : NextPowerOf2(CondSize); + + Type *NewTy = Type::getIntNTy(Ctx, Size); + + Builder.SetInsertPoint(Switch); + Value *NewCond = + Builder.CreateSExt(Cond, NewTy, Switch->getName() + ".condSExt"); + Switch->setCondition(NewCond); + + for (auto Case : Switch->cases()) { + APInt UpdatedCase = Case.getCaseValue()->getValue().sext(Size); + Case.setValue(ConstantInt::get(Ctx, UpdatedCase)); + } + + Modified = true; + } +#endif + + return Modified; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp new file mode 100644 index 000000000000..df8c4e93244c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp @@ -0,0 +1,201 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains a table of extra information about the llvm.genx.* +// intrinsics, used by the vISA register allocator and function writer to +// decide exactly what operand type to use. The more usual approach in an LLVM +// target is to have an intrinsic map to an instruction in instruction +// selection, then have register category information on the instruction. But +// we are not using the target independent code generator, we are generating +// code directly from LLVM IR. +// +//===----------------------------------------------------------------------===// +#include "GenXIntrinsics.h" +#include "IsaDescription.h" +#include "visa_igc_common_header.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; + +// In this table: +// +// Each ALU and shared function intrinsic has a record giving information +// about its operands, and how it is written as a vISA instruction. The +// record has an initial field giving the intrinsic ID, then a number of +// fields where each corresponds to a field in the vISA instruction. +// +// A field may be several values combined with the | operator. The first +// value is the operand category (GENERAL etc), or one of a set of +// non-register operand categories (LITERAL, BYTE), or END to terminate +// the record. Other modifier values may be combined, such as SIGNED. +// The LLVM IR argument index plus 1 is also combined in, or 0 for the +// return value. 
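+//
+// Consumers look an intrinsic up by ID and then query the per-field
+// information through GenXIntrinsicInfo; roughly (a sketch only, not code
+// from this file):
+//
+//   GenXIntrinsicInfo II(GenXIntrinsic::getAnyIntrinsicID(CalledF));
+//   if (II.isNotNull()) {
+//     auto RetInfo = II.getRetInfo();   // field describing the result
+//     auto Arg0 = II.getArgInfo(0);     // field describing IR argument 0
+//     bool NeedsSigned = Arg0.needsSigned();
+//   }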
+ +// Video Analytics intrinsic helper macros, mainly to avoid large blocks +// of near-identical code in the intrinsics look-up table and also to +// aid readability. + +const GenXIntrinsicInfo::DescrElementType GenXIntrinsicInfo::Table[] = { + +// Region access intrinsics do not appear in this table + +#include "GenXIntrinsicInfoTable.inc" + + END}; + +GenXIntrinsicInfo::GenXIntrinsicInfo(unsigned IntrinId) : Args(0) { + const auto *p = Table; + for (;;) { + if (*p == END) + break; // intrinsic not found; leave Args pointing at END field + if (IntrinId == *p++) + break; + // Scan past the rest of this entry. + while (*p++ != END) + ; + } + // We have found the right entry. + Args = p; +} + +// Get the category and modifier for an arg idx (-1 means return value). +// The returned ArgInfo struct contains just the short read from the table, +// and has methods for accessing the various fields. +GenXIntrinsicInfo::ArgInfo GenXIntrinsicInfo::getArgInfo(int Idx) { + // Read through the fields in the table to find the one with the right + // arg index... + for (const auto *p = Args; *p; p++) { + ArgInfo AI(*p); + if (AI.isRealArgOrRet() && AI.getArgIdx() == Idx) + return AI; + } + // Field with requested arg index was not found. + return END; +} + +// Return the starting point of any trailing null (zero) arguments +// for this call. If the intrinsic does not have a ARGCOUNT descriptor +// this will always return the number of operands to the call (ie, there +// is no trailing null zone), even if there are some trailing nulls. +unsigned GenXIntrinsicInfo::getTrailingNullZoneStart(CallInst *CI) { + unsigned TrailingNullStart = CI->getNumArgOperands(); + + const auto *p = Args; + for (; *p; p++) { + ArgInfo AI(*p); + if (AI.getCategory() == ARGCOUNT) + break; + } + + if (*p) { + ArgInfo ACI(*p); + unsigned BaseArg = ACI.getArgIdx(); + + TrailingNullStart = BaseArg; + for (unsigned Idx = BaseArg; Idx < CI->getNumArgOperands(); ++Idx) { + if (auto CA = dyn_cast(CI->getArgOperand(Idx))) { + if (CA->isNullValue()) + continue; + } + TrailingNullStart = Idx + 1; + } + + if (TrailingNullStart < BaseArg + ACI.getArgCountMin()) + TrailingNullStart = BaseArg + ACI.getArgCountMin(); + } + + return TrailingNullStart; +} + +/*********************************************************************** + * getExecSizeAllowedBits : get bitmap of which execsize values are allowed + * for this intrinsic + * + * Return: bit N set if execution size 1<getCalledFunction(); + assert(CalledF); + auto ID = GenXIntrinsic::getGenXIntrinsicID(CalledF); + + switch (ID) { + default: + break; + // Exec size of intrinsics with channels are inferred from address operand. 
+ case GenXIntrinsic::genx_gather4_scaled2: + return CI->getArgOperand(4)->getType()->getVectorNumElements(); + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_raw_send_noresult: + case GenXIntrinsic::genx_raw_sends_noresult: + return 16; + } + + return 0; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h new file mode 100644 index 000000000000..9162c57e6126 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h @@ -0,0 +1,324 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares a class to access a table of extra information about the +// llvm.genx.* intrinsics, used by the vISA register allocator and function +// writer to decide exactly what operand type to use. The more usual approach +// in an LLVM target is to have an intrinsic map to an instruction in +// instruction selection, then have register category information on the +// instruction. But we are not using the target independent code generator, we +// are generating code directly from LLVM IR. +// +//===----------------------------------------------------------------------===// +#ifndef GENXINTRINSICS_H +#define GENXINTRINSICS_H +#include "GenXVisa.h" + +#define GENX_ITR_CATVAL(v) ((v) << CATBASE) +#define GENX_ITR_FLAGENUM(o, v) ((v) << ((o) + FLAGBASE)) +#define GENX_ITR_FLAGMASK(o, w) (((1 << (w)) - 1) << ((o) + FLAGBASE)) +#define GENX_ITR_FLAGVAL(o) GENX_ITR_FLAGENUM(o, 1) + +namespace llvm { + class CallInst; + +class GenXIntrinsicInfo { +public: + typedef uint32_t DescrElementType; +private: + const DescrElementType *Args; + static const DescrElementType Table[]; +public: + enum { + // General format of intrinsic descriptor words: + // Bits 31..24: Category enumeration + // Bits 23..8: Flags, if any, meaning and layout depends on category + // Bits 7..0: Operand or literal, if any + // + // One exception to the above is LITERAL, where everything that isn't + // the category field is assumed to be the literal value. 
+ // + // If you want to re-apportion space in the descriptor word (typically + // because you need another flag and you can't express what you need to + // do without creating one) then just modify FLAGBASE and FLAGWIDTH + // below, and everything else will shake itself out appropriately. + // Currently 8 bits are allocated for the category enumaration bitfield, + // although the actual enumeration values defined only require 6 bits - + // and there is still plenty of space left over even within that. + // Similarly, there are 8 bits allocated to the operand bitfield, and + // currently the maximum needed is 5. + // + // At the moment, the GENERAL category has 5 unused flag bits available + // to it, the RAW category has 13 unused bits, and the ARGCOUNT category + // has 13 unused bits. No other categories make use of the flags yet, + // so it should be a good while yet before it's necessary to resize + // the bitfields. + + FLAGBASE = 8, + FLAGWIDTH = 16, + CATBASE = FLAGBASE + FLAGWIDTH, + + CATMASK = ~((1 << CATBASE) - 1), + FLAGMASK = ((~((1 << FLAGBASE) - 1)) ^ CATMASK), + OPNDMASK = ~(CATMASK | FLAGMASK), + + // A field that does not contain an operand number or literal value: + END = 0, // end of instruction description + IMPLICITPRED = GENX_ITR_CATVAL(0x01), // implicit predication field + NULLRAW = GENX_ITR_CATVAL(0x02), // null raw operand + ISBARRIER = GENX_ITR_CATVAL(0x03), // intrinsic is barrier: suppress nobarrier attribute + + EXECSIZE = GENX_ITR_CATVAL(0x04), // execution size + EXECSIZE_GE2 = GENX_ITR_CATVAL(0x05), // execution size (must be >= 2) + EXECSIZE_GE4 = GENX_ITR_CATVAL(0x06), // execution size (must be >= 4) + EXECSIZE_GE8 = GENX_ITR_CATVAL(0x07), // execution size (must be >= 8) + EXECSIZE_NOT2 = GENX_ITR_CATVAL(0x08), // execution size (cannot be 2) + + // A field that contains a literal value the operand field + LITERAL = GENX_ITR_CATVAL(0x09), // literal byte (usually opcode) + LITMASK = ~CATMASK, + + // A field that contains an operand number, other than general: + FIRST_OPERAND = GENX_ITR_CATVAL(0x10), + LOG2OWORDS = GENX_ITR_CATVAL(0x10), // log2 number of owords + NUMGRFS = GENX_ITR_CATVAL(0x11), // rounded up number of GRFs + EXECSIZE_FROM_ARG = GENX_ITR_CATVAL(0x12), // exec_size field inferred from width of + // predication arg + SVMGATHERBLOCKSIZE = GENX_ITR_CATVAL(0x13), // svm gather block size, inferred from data type + LOG2OWORDS_PLUS_8 = GENX_ITR_CATVAL(0x14), // log2 number of owords, plus 8 + GATHERNUMELTS = GENX_ITR_CATVAL(0x15), // gather/scatter "num elements" field + TRANSPOSEHEIGHT = GENX_ITR_CATVAL(0x16), // block_height field in transpose + LOG2ELTSIZE = GENX_ITR_CATVAL(0x17), // log2 element size in gather/scatter + ARGCOUNT = GENX_ITR_CATVAL(0x18), // Byte containing number of non-undef operands + EXECSIZE_FROM_BYTE = GENX_ITR_CATVAL(0x19), // exec_size specified in byte + ARGCOUNTMASK = GENX_ITR_FLAGMASK(0, 3), // Space for minumum argument count + ARGCOUNTMIN1 = GENX_ITR_FLAGENUM(0, 1), // Must have at least one argument + + // A field that contains an operand number, other than general, and it + // is the "real" use of the operand, rather than an auxiliary use + // such as a "number of GRFs" field relating to this operand. 
+ FIRST_REAL_OPERAND = GENX_ITR_CATVAL(0x20), + BYTE = GENX_ITR_CATVAL(0x20), // constant byte operand + SHORT = GENX_ITR_CATVAL(0x21), // constant short operand + INT = GENX_ITR_CATVAL(0x22), // constant int operand + ADDRESS = GENX_ITR_CATVAL(0x23), // address operand + PREDICATE = GENX_ITR_CATVAL(0x24), // predicate operand + PREDICATE_ZEROED = GENX_ITR_FLAGVAL(0), + Z_PREDICATE = PREDICATE | PREDICATE_ZEROED, + SAMPLER = GENX_ITR_CATVAL(0x25), // sampler operand + SURFACE = GENX_ITR_CATVAL(0x26), // surface operand + VME = GENX_ITR_CATVAL(0x27), // vme operand + // byte height of media 2D block, inferred from the width operand + // pointed at and the size of the return type or final operand type + MEDIAHEIGHT = GENX_ITR_CATVAL(0x28), + // predication control field from explicit predicate arg + PREDICATION = GENX_ITR_CATVAL(0x29), + // chmask field in load/sample, with exec size bit + SAMPLECHMASK = GENX_ITR_CATVAL(0x2a), + // does not appear in the vISA output, but needs to be two address + // coalesced with result + TWOADDR = GENX_ITR_CATVAL(0x2b), + CONSTVI1ASI32 = GENX_ITR_CATVAL(0x2c), // constant vXi1 written as i32 (used in setp) + RAW = GENX_ITR_CATVAL(0x2d), // raw operand or result, + // Raw descriptor flags, 3 bits used + RAW_UNSIGNED = GENX_ITR_FLAGVAL(0), // raw operand/result must be unsigned + RAW_SIGNED = GENX_ITR_FLAGVAL(1), // raw operand/result must be signed + RAW_NULLALLOWED = GENX_ITR_FLAGVAL(2), // raw operand or result can be null (V0) + URAW = RAW | RAW_UNSIGNED, + SRAW = RAW | RAW_SIGNED, + EXECSIZE_NOMASK = GENX_ITR_CATVAL(0x2e), // execution size with NoMask + + // A general operand + GENERAL = GENX_ITR_CATVAL(0x30), + // Modifiers for destination or source, 7 bits used + UNSIGNED = GENX_ITR_FLAGVAL(0), // int type forced to unsigned + SIGNED = GENX_ITR_FLAGVAL(1), // int type forced to signed + OWALIGNED = GENX_ITR_FLAGVAL(2), // must be oword aligned + GRFALIGNED = GENX_ITR_FLAGVAL(3), // must be grf aligned + RESTRICTION = GENX_ITR_FLAGMASK(4, 3), // field with operand width restriction + FIXED4 = GENX_ITR_FLAGENUM(4, 1), // operand is fixed size 4 vector and contiguous + CONTIGUOUS = GENX_ITR_FLAGENUM(4, 2), // operand must be contiguous + SCALARORCONTIGUOUS = GENX_ITR_FLAGENUM(4, 3), // operand must be stride 0 or contiguous + TWICEWIDTH = GENX_ITR_FLAGENUM(4, 4), // operand is twice the execution width + STRIDE1 = GENX_ITR_FLAGENUM(4, 5), // horizontal stride must be 1 + // Modifiers for destination only, 2 bits used + SATURATION = GENX_ITR_FLAGMASK(7, 2), + SATURATION_DEFAULT = GENX_ITR_FLAGENUM(7, 0), // saturation default: not saturated, fp is + // allowed to bale in to saturate inst + SATURATION_SATURATE = GENX_ITR_FLAGENUM(7, 1), // saturated + SATURATION_NOSAT = GENX_ITR_FLAGENUM(7, 2), // fp not allowed to bale in to saturate inst + SATURATION_INTALLOWED = GENX_ITR_FLAGENUM(7, 3), // int is allowed to bale in to saturate, + // because inst cannot overflow so + // saturation only required on destination + // truncation + // Modifiers for source only, 3 bits used + NOIMM = GENX_ITR_FLAGVAL(7), // source not allowed to be immediate + MODIFIER = GENX_ITR_FLAGMASK(8, 2), + MODIFIER_DEFAULT = GENX_ITR_FLAGENUM(8, 0), // src modifier default: none + MODIFIER_ARITH = GENX_ITR_FLAGENUM(8, 1), // src modifier: arithmetic + MODIFIER_LOGIC = GENX_ITR_FLAGENUM(8, 2), // src modifier: logic + MODIFIER_EXTONLY = GENX_ITR_FLAGENUM(8, 3), // src modifier: extend only + DIRECTONLY = GENX_ITR_FLAGVAL(10), // indirect region not allowed + }; + struct ArgInfo { + 
unsigned Info; + // Default constructor, used in GenXBaling to construct an ArgInfo that + // represents an arg of a non-call instruction. + ArgInfo() : Info(GENERAL) {} + // Construct from a field read from the intrinsics info table. + ArgInfo(unsigned Info) : Info(Info) {} + // getCategory : return field category + unsigned getCategory() { return Info & CATMASK; } + // getLogAlignment : get any special alignment requirement, else 0 + unsigned getLogAlignment() { + if (isGeneral()) { + if (Info & GRFALIGNED) + return 5; + if (Info & OWALIGNED) + return 4; + return 0; + } + if (isRaw()) + return 5; + return 0; + } + // isGeneral : test whether this is a general operand + bool isGeneral() { return getCategory() == GENERAL; } + bool needsSigned() { + if (isGeneral()) + return Info & SIGNED; + if (isRaw()) + return Info & RAW_SIGNED; + return false; + } + bool needsUnsigned() { + if (isGeneral()) + return Info & UNSIGNED; + if (isRaw()) + return Info & RAW_UNSIGNED; + return false; + } + bool rawNullAllowed() { + assert(isRaw()); + return Info & RAW_NULLALLOWED; + } + // isArgOrRet : test whether this field has an arg index + bool isArgOrRet() { + if (isGeneral()) return true; + if ((Info & CATMASK) >= FIRST_OPERAND) + return true; + return false; + } + // isRealArgOrRet : test whether this field has an arg index, and is + // a "real" use of the arg + bool isRealArgOrRet() { + if (isGeneral()) return true; + if ((Info & CATMASK) >= FIRST_REAL_OPERAND) + return true; + return false; + } + // getArgCountMin : return minimum number of arguments + int getArgCountMin() { + assert(getCategory() == ARGCOUNT); + return (Info & ARGCOUNTMASK) >> FLAGBASE; + } + // getArgIdx : return argument index for this field, or -1 for return value + // (assuming isArgOrRet()) + int getArgIdx() { assert(isArgOrRet()); return (Info & OPNDMASK) - 1; } + // getLiteral : for a LITERAL or EXECSIZE field, return the literal value + unsigned getLiteral() { return Info & LITMASK; } + // isRet : test whether this is the field for the return value + // (assuming isArgOrRet()) + bool isRet() { return getArgIdx() < 0; } + // isRaw : test whether this is a raw arg or return value + bool isRaw() { return getCategory() == RAW; } + // getSaturation : return saturation info for the arg + unsigned getSaturation() { return Info & SATURATION; } + // getRestriction : return operand width/region restriction, one of + // 0 (no restriction), FIXED4, CONTIGUOUS, TWICEWIDTH + unsigned getRestriction() { return Info & RESTRICTION; } + // isImmediateDisallowed : test whether immediate disallowed + // (assuming isArgOrRet()) + bool isImmediateDisallowed() { + assert(isArgOrRet()); + if (isGeneral()) + return Info & NOIMM; + if (isRaw()) + return true; + switch (Info & CATMASK) { + case TWOADDR: + case PREDICATION: + case SURFACE: + case SAMPLER: + case VME: + return true; + default: break; + } + return false; + } + // getModifier : get what source modifier is allowed + unsigned getModifier() { + assert(isGeneral() && isArgOrRet() && !isRet()); + return Info & MODIFIER; + } + }; + // GenXIntrinsics::iterator : iterate through the fields + class iterator { + const DescrElementType *p; + public: + iterator(const DescrElementType *p) : p(p) {} + iterator &operator++() { ++p; if (*p == END) p = 0; return *this; } + ArgInfo operator*() { return ArgInfo(*p); } + bool operator!=(iterator i) { return p != i.p; } + }; + iterator begin() { + assert(isNotNull() && "iterating an intrinsic without info"); + return iterator(Args); + } + iterator end() { 
return iterator(0); } + // Construct a GenXIntrinsicInfo for a particular intrinsic + GenXIntrinsicInfo(unsigned IntrinId); + bool isNull() const { return *getInstDesc() == GenXIntrinsicInfo::END; } + bool isNotNull() const { return !isNull(); } + // Return instruction description. + const DescrElementType *getInstDesc() const { return Args; } + // Get the category and modifier for an arg idx + ArgInfo getArgInfo(int Idx); + // Get the trailing null zone, if any. + unsigned getTrailingNullZoneStart(CallInst *CI); + // Get the category and modifier for the return value + ArgInfo getRetInfo() { return getArgInfo(-1); } + // Get bitmap of allowed execution sizes + unsigned getExecSizeAllowedBits(); + // Determine if a predicated destination mask is permitted + bool getPredAllowed(); + // Get The overrided execution size or 0. + static unsigned getOverridedExecSize(CallInst *CI, + const GenXSubtarget *ST = nullptr); +}; + +} // namespace llvm +#endif // ndef GENXINTRINSICS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp new file mode 100644 index 000000000000..485635a8268c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp @@ -0,0 +1,126 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLayoutBlocks +/// ------------------- +/// +/// This pass tidies the control flow in the following way: +/// +/// It reorders blocks to increase fallthrough generally, and specifically +/// to ensure that SIMD CF goto and join have the required structure: the +/// "false" successor must be fallthrough and the "true" successor must be +/// forward. (The '"true" successor must be forward' requirement is a vISA +/// requirement, because vISA goto/join does not specify JIP, and the +/// finalizer reconstructs it on this assumption.) +/// +/// This pass is invoked in ISPC flow to ensure SIMD CF conformance. 
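+///
+/// A small illustration (block names are hypothetical): for a block A ending
+/// in a SIMD CF goto with "false" successor B and "true" successor C, the
+/// pass emits the layout
+///
+///   A:  goto ...   ; falls through to B, branches forward to C
+///   B:  ...
+///   C:  join ...
+///
+/// so that B is the fallthrough successor and C is reached by a forward
+/// branch, as the finalizer assumes when it reconstructs JIP.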
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LAYOUTBLOCKS" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * GenXLayoutBlocks pass declaration + */ +namespace { +class GenXLayoutBlocks : public FunctionPass { +public: + static char ID; + explicit GenXLayoutBlocks() : FunctionPass(ID) {} + virtual StringRef getPassName() const { return "GenX layout blocks"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } +}; +} // end anonymous namespace. + +char GenXLayoutBlocks::ID = 0; +namespace llvm { +void initializeGenXLayoutBlocksPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLayoutBlocks, "GenXLayoutBlocks", "GenXLayoutBlocks", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GenXLayoutBlocks, "GenXLayoutBlocks", "GenXLayoutBlocks", + false, false) +namespace llvm { + FunctionPass *createGenXLayoutBlocksPass() { + initializeGenXLayoutBlocksPass(*PassRegistry::getPassRegistry()); + return new GenXLayoutBlocks; + } +} // namespace llvm + +/*********************************************************************** + * GenXLayoutBlocks::runOnFunction: + * reorder blocks to increase fallthrough, + * and specifically to satisfy the requirements of SIMD control flow + */ +bool GenXLayoutBlocks::runOnFunction(Function &F) { + if (F.empty()) + return false; + LoopInfo &LI = getAnalysis().getLoopInfo(); + if (LI.empty()) + LayoutBlocks(F); + else + LayoutBlocks(F, LI); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp new file mode 100644 index 000000000000..339cb22e5900 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp @@ -0,0 +1,2613 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLegalization +/// ---------------- +/// +/// GenXLegalization is a function pass that splits vector instructions +/// up to make execution widths legal, and to ensure that the GRF crossing rules +/// are satisfied. +/// +/// This pass makes the LLVM IR closer to legal vISA by +/// splitting up any instruction that has an illegal vector width (too big or +/// non power of two) or an illegal region (illegal vstride/width/stride or +/// illegal GRF crossing). +/// +/// **IR restriction**: After this pass, LLVM IR represents vISA instructions +/// with legal execution width and region parameters, and with any particular +/// instruction's region restrictions adhered to. +/// +/// The pass uses the instruction baling information to tell which +/// regions an instruction has. Splitting an instruction and its regions needs +/// to be done with reference to all the regions at the same time, as they may +/// need splitting at different points. +/// +/// For general values, an illegal width instruction is split by +/// creating narrower instructions, each of which uses a rdregion to extract the +/// subregion for each source operand, and then uses a wrregion to insert the +/// resulting subregion into the original destination value. The original +/// illegal width values survive, and that is OK because a vISA register can +/// have any vector width. +/// +/// The pass uses the hasIndirectGRFCrossing feature from GenXSubtarget when +/// calculating whether a region is legal, or how a region needs to be split, in +/// the case that the region is indirect. +/// +/// The legalization pass considers a bale of instructions as a separate +/// entity which can be split without reference to other bales. This works +/// because the overhead of splitting, which is an extra rdregion per operand +/// and an extra wrregion on the result, is pretty much free in that these extra +/// region accesses are baled in to the split instruction. +/// +/// There are some cases where we decide we need to unbale an instruction, i.e. +/// remove it (or rather the subtree of instructions in the bale rooted at it) +/// from the bale, and then re-start the analysis for the bale. This happens +/// when there are two conflicting requirements in the bale, for example a main +/// instruction that needs at least simd4 but a rdregion that can only manage +/// simd2. +/// +/// The pass scans backwards through the code, which makes this unbaling a bit +/// easier. An unbaled instruction will be encountered again a bit later, and +/// be processed as its own bale. +/// +/// If a source operand being split is already an rdregion, then that rdregion +/// is split, so the new split rdregions read from the original rdregion's +/// input. +/// +/// Similarly, if the bale is already headed by an wrregion, it is replaced by +/// the new split wrregions used to join the splits back together. +/// +/// BitCast is not split in this pass. 
A non-category-converting BitCast is +/// always coalesced in GenXCoalescing, so never generates actual code. Thus it +/// does not matter if it has an illegal size. +/// +/// Predicate legalization +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// Predicates (vector of i1) are more complex. A general vISA value can be any +/// vector width, but a predicate can only be a power of two up to 32. Thus the +/// actual predicate values need to be split, not just the reads from and writes +/// to the values. +/// +/// Furthermore, although it is possible to read and write a region within a +/// predicate, using H1/H2/Q1..Q4 flags, there are restrictions: the start +/// offset must be 8 aligned (4 aligned for a select or cmp with 64-bit +/// operands), and the size must be no more than the misalignment of the start +/// offset (e.g. for a start offset of 8, the size can be 8 but not 16). +/// +/// So this pass splits an arbitrary size predicate value (including predicate +/// phi nodes) into as many as possible 32 bit parts, then descending power of +/// two parts. For example, a predicate of size 37 is split into 32,4,1. +/// +/// Then, within each part, a read or write of the predicate can be further +/// split as long as it fits the restrictions above, e.g. a 32 bit part can be +/// read/written in 8 or 16 bit subregions. +/// +/// This is achieved in two steps: +/// +/// 1. Predicates take part in the main code of GenXLegalization. When deciding +/// how to split a read or write of a predicate, we determine how the predicate +/// value will be split into parts (e.g. the 37 split into 32,4,1 example +/// above), then decides how a part could be subregioned if necessary (e.g. +/// the 32 could have a 16 aligned 16 bit region, or an 8 aligned 8 bit +/// region). As well as a maximum, this usually gives a minimum size region. +/// If the rest of the bale cannot achieve that minimum size, then we unbale +/// to avoid the problem and restart the analysis of the bale. +/// +/// 2. Then, fixIllegalPredicates() actually divides the illegally sized +/// predicate values, including phi nodes. The splitting in the main part of +/// GenXLegalization ensures that no read or write of a predicate value +/// crosses a part boundary, so it is straightforward to split the values +/// into those parts. +/// +/// This is complicated by the case that the IR before legalization has an +/// rdpredregion. This typically happens when a CM select has odd size operands +/// but an i32 mask. Clang codegen bitcasts the i32 mask to v32i1, then does a +/// shufflevector to extract the correct size predicate. GenXLowering turns the +/// shufflevector into rdpredregion. The main code in GenXLegalization splits +/// the rdpredregion into several rdpredregions. +/// +/// In that case, we cannot guarantee that fixIllegalPredicates will find legal +/// rdpredregions. For example, suppose the original rdpredregion has a v32i1 as +/// input, and v13i1 as result. It is determined that the 13 bit predicate will +/// be split into 8,4,1 parts. The main GenXLegalization code will generate +/// an rdpredregion from the 32 bit predicate for each part of the 13 bit +/// predicate. However, the rdpredregion for the 1 bit part is illegal, because +/// its start offset is not 8 aligned. +/// +/// We currently do not cope with that (it will probably assert somewhere). If +/// we do find a need to cope with it, then the illegal rdpredregion will need +/// to be lowered to bit twiddling code. 
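+///
+/// A minimal sketch of the part decomposition described above (the real
+/// logic lives in getPredPart/getLegalPredSize; this loop is illustrative
+/// only):
+///
+///   // 37 -> 32, 4, 1    13 -> 8, 4, 1
+///   SmallVector<unsigned, 4> Parts;
+///   for (unsigned Rest = NumElts; Rest != 0;) {
+///     unsigned Part = std::min(32u, PowerOf2Floor(Rest));
+///     Parts.push_back(Part);
+///     Rest -= Part;
+///   }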
+/// +/// Other tasks of GenXLegalization +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An additional task of this pass is to lower an any/all intrinsic that is +/// used anywhere other than as the predicate of a scalar wrregion by inserting +/// such a scalar wrregion with a byte 0/1 result and then a compare of that +/// to give an i1. +/// +/// A further task of this pass is to lower any predicated wrregion where the +/// value to write is a vector wider than 1 but the predicate is a scalar i1 +/// (other than the value 1, which means unpredicated). It inserts code to splat +/// the scalar i1 predicate to v16i1 or v32i1. This is really part of lowering, +/// but we need to do it here because in GenXLowering the value to write might +/// be wider than 32. +/// +/// An extra optimization performed in this pass is to transform a move (that +/// is, a lone wrregion or lone rdregion or a rdregion+wrregion baled together) +/// with a byte element type into the equivalent short or int move. This saves +/// the jitter having to split the byte move into even and odd halves. This +/// optimization needs to be done when baling info is available, so legalization +/// is a handy place to put it. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LEGALIZATION" + +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "KillAnalysis.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// Information on a part of a predicate. +struct PredPart { + unsigned Offset; + unsigned Size; + unsigned PartNum; +}; + +// min and max legal size for a predicate split +struct LegalPredSize { + unsigned Min; + unsigned Max; +}; + +// GenXLegalization : legalize execution widths and GRF crossing +class GenXLegalization : public FunctionPass { + enum { DETERMINEWIDTH_UNBALE = 0, DETERMINEWIDTH_NO_SPLIT = 256 }; + GenXBaling *Baling = nullptr; + const GenXSubtarget *ST = nullptr; + ScalarEvolution *SE = nullptr; + bool EnableTransformByteMove = true; + // Work variables when in the process of splitting a bale. + // The Bale being split. (Also info on whether it has FIXED4 and TWICEWIDTH + // operands.) + Bale B; + Use *Fixed4 = nullptr; + Use *TwiceWidth = nullptr; + // Map from the original instruction to the split one for the current index. + std::map SplitMap; + + // Consider reading from and writing to the same region in this bale, + // bale { + // W1 = rdr(V0, R) + // W2 = op(W1, ...) + // V1 = wrd(V0, W2, R) + // } + // if splitting the above bale into two bales + // bale { + // W1.0 = rdr(V0, R.0) + // W2.0 = op(W1.0, ...) + // V1.0 = wrr(V0, W2.0, R.0) + // } + // bale { + // W1.1 = rdr(V0, R.1) + // W2.1 = op(W1.1, ...) + // V1.1 = wrr(V1.0, W2.1, R1) + // } + // V1.0 and V0 are live at the same time. This makes copy-coalescing + // fail and also increases rp by the size of V0. 
+ // + // If we can prove that + // (*) rdr(V0, R.1) == rdr(V1.0, R.1) = rdr(wrr(V0, W2.0, R.0), R.1) + // then we could split the bale slightly differently: + // bale { + // W1.0 = rdr(V0, R.0) + // W2.0 = op(W1.0, ...) + // V1.0 = wrr(V0, W2.0, R.0) + // } + // bale { + // W1.1 = rdr(V1.0, R.1) + // W2.1 = op(W1.1, ...) + // V1.1 = wrr(V1.0, W2.1, R1) + // } + // If V0 is killed after this bale, then V1.0, V1.1 and V0 + // could be coalesced into a single variable. This is the pattern + // for in-place operations. + // + // To satisfy equation (*), it suffices to prove there is no overlap for any + // two neighbor subregions. This holds for the following two cases: + // (1) 1D direct regions or indirect regions with single offset + // (2) 2D direct regions with VStride >= Width, or indirect regions with + // single offset. + // + // While legalizing a bale ends with a g_store instruction, we produce the + // following code sequences. + // bale { + // V1 = rdr(V0, 0, 32) + // V2 = fadd V1, 1 + // store V2, p + // } + // ===> + // bale { + // V1.0 = rdr(V0, 0, 16) + // V2.0 = fadd V1.0, 1 + // V3.0 = wrr(load(p), V2.0, 0, 16) + // store V3.0, p + // } + // bale { + // V1.1 = rdr(V0, 16, 32) + // V2.1 = fadd V1.1, 1 + // V3.1 = wrr(load(p), V2.1, 16, 32) + // store V3.1, p + // } + // The instruction stream looks like: + // + // V1.0 = rdr(V0, 0, 16) + // V1.1 = rdr(V0, 16, 32) + // V2.0 = fadd V1.0, 1 + // V2.1 = fadd V1.1, 1 + // V3.0 = wrr(load(p), V2.0, 0, 16) + // store V3.0, p + // V3.1 = wrr(load(p), V2.1, 16, 32) + // store V3.1, p + // + // That is, this process does not produce region joins. + // + enum SplitKind { + SplitKind_Normal, // split bales without propagation. + SplitKind_Propagation, // split bales with propagation. + SplitKind_GStore // split bales end with g_store. + }; + SplitKind CurSplitKind = SplitKind_Normal; + // Current instruction in loop in runOnFunction, which gets adjusted if that + // instruction is erased. + Instruction *CurrentInst = nullptr; + // Illegally sized predicate values that need splitting at the end of + // processing the function. 
+ SetVector IllegalPredicates; + +public: + static char ID; + explicit GenXLegalization() : FunctionPass(ID) { clearBale(); } + virtual StringRef getPassName() const { + return "GenX execution width and GRF crossing legalization"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } + +private: + void clearBale() { + B.clear(); + Fixed4 = nullptr; + TwiceWidth = nullptr; + } + unsigned getExecSizeAllowedBits(Instruction *Inst); + bool processInst(Instruction *Inst); + bool processBale(Instruction *InsertBefore); + bool noSplitProcessing(); + bool processAllAny(Instruction *Inst, Instruction *InsertBefore); + bool processBitCastFromPredicate(Instruction *Inst, + Instruction *InsertBefore); + bool processBitCastToPredicate(Instruction *Inst, Instruction *InsertBefore); + unsigned getExecutionWidth(); + unsigned determineWidth(unsigned WholeWidth, unsigned StartIdx); + unsigned determineNonRegionWidth(Instruction *Inst, unsigned StartIdx); + LegalPredSize getLegalPredSize(Value *Pred, Type *ElementTy, + unsigned StartIdx, unsigned RemainingSize = 0); + PredPart getPredPart(Value *V, unsigned Offset); + Value *splitBale(Value *Last, unsigned StartIdx, unsigned Width, + Instruction *InsertBefore); + Value *joinBaleInsts(Value *Last, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore); + Value *joinBaleResult(Value *Last, Value *LastSplitInst, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore); + Value *joinGStore(Value *Last, BaleInst GStore, BaleInst WrRegion, + unsigned StartIdx, unsigned Width, + Instruction *InserBefore); + Value *joinWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *joinPredPredWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *joinAnyWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *splitInst(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore, + const DebugLoc &DL); + Value *getSplitOperand(Instruction *Inst, unsigned OperandNum, + unsigned StartIdx, unsigned Size, + Instruction *InsertBefore, const DebugLoc &DL); + Instruction *convertToMultiIndirect(Instruction *Inst, Value *LastJoinVal, + Region *R, Instruction *InsertBefore); + Instruction *transformByteMove(Bale *B); + Value *splatPredicateIfNecessary(Value *V, Type *ValueToWriteTy, + Instruction *InsertBefore, + const DebugLoc &DL); + Value *splatPredicateIfNecessary(Value *V, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL); + void eraseInst(Instruction *Inst); + void removingInst(Instruction *Inst); + void fixIllegalPredicates(Function *F); + void fixIntrinsicCalls(Function *F); + SplitKind checkBaleSplittingKind(); +}; + +static const unsigned MaxPredSize = 32; + +} // end anonymous namespace + +char GenXLegalization::ID = 0; +namespace llvm { +void initializeGenXLegalizationPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLegalization, "GenXLegalization", "GenXLegalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(GenXLegalization, "GenXLegalization", "GenXLegalization", + false, false) + 
+FunctionPass *llvm::createGenXLegalizationPass() {
+  initializeGenXLegalizationPass(*PassRegistry::getPassRegistry());
+  return new GenXLegalization;
+}
+
+void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<GenXFuncBaling>();
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addPreserved();
+}
+
+/***********************************************************************
+ * GenXLegalization::runOnFunction : process one function to
+ *    legalize execution width and GRF crossing
+ */
+bool GenXLegalization::runOnFunction(Function &F) {
+  Baling = &getAnalysis<GenXFuncBaling>();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto P = getAnalysisIfAvailable();
+  ST = P ? P->getSubtarget() : nullptr;
+  // Check args for illegal predicates.
+  for (Function::arg_iterator fi = F.arg_begin(), fe = F.arg_end(); fi != fe;
+       ++fi) {
+    Argument *Arg = &*fi;
+    if (auto VT = dyn_cast<VectorType>(Arg->getType()))
+      if (VT->getElementType()->isIntegerTy(1))
+        assert(getPredPart(Arg, 0).Size == VT->getNumElements() &&
+               "function arg not allowed to be illegally sized predicate");
+  }
+
+  // TODO. remove this restriction.
+  for (auto &GV : F.getParent()->getGlobalList()) {
+    if (std::any_of(GV.user_begin(), GV.user_end(), [](Value *U) {
+          return isa<LoadInst>(U) || isa<StoreInst>(U);
+        })) {
+      EnableTransformByteMove = false;
+      break;
+    }
+  }
+
+  // Legalize instructions. This does a postordered depth first traversal of the
+  // CFG, and scans backwards in each basic block, to ensure that, if we unbale
+  // anything, it then gets processed subsequently.
+  for (po_iterator<BasicBlock *> i = po_begin(&F.getEntryBlock()),
+                                 e = po_end(&F.getEntryBlock());
+       i != e; ++i) {
+    BasicBlock *BB = *i;
+    // The effect of this loop is that we process the instructions in reverse
+    // order, and we re-process anything inserted before the instruction
+    // being processed. CurrentInst is a field in the GenXLegalization object,
+    // which gets updated if the instruction it points at is removed.
+    for (CurrentInst = BB->getTerminator(); CurrentInst;) {
+      // If processInst returns true, re-process the same instruction. This is
+      // used when unbaling.
+      while (processInst(CurrentInst))
+        LLVM_DEBUG(dbgs() << "reprocessing\n");
+      CurrentInst =
+          CurrentInst == &BB->front() ? nullptr : CurrentInst->getPrevNode();
+    }
+  }
+  fixIntrinsicCalls(&F);
+  fixIllegalPredicates(&F);
+  IllegalPredicates.clear();
+
+  return true;
+}
+
+/***********************************************************************
+ * getExecSizeAllowedBits : get bitmap of allowed execution sizes
+ *
+ * Enter:   Inst = main instruction of bale
+ *
+ * Return:  bit N set if execution size 1<<N is allowed
+ */
+unsigned GenXLegalization::getExecSizeAllowedBits(Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  default:
+    break;
+  case BinaryOperator::SDiv:
+  case BinaryOperator::UDiv:
+  case BinaryOperator::SRem:
+  case BinaryOperator::URem:
+    return ST->emulateIDivRem() ? 0x3f : 0x1f;
+  }
+
+  unsigned ID = GenXIntrinsic::getAnyIntrinsicID(Inst);
+  switch (ID) {
+  case GenXIntrinsic::genx_ssmad:
+  case GenXIntrinsic::genx_sumad:
+  case GenXIntrinsic::genx_usmad:
+  case GenXIntrinsic::genx_uumad:
+  case GenXIntrinsic::genx_ssmad_sat:
+  case GenXIntrinsic::genx_sumad_sat:
+  case GenXIntrinsic::genx_usmad_sat:
+  case GenXIntrinsic::genx_uumad_sat:
+  case Intrinsic::fma:
+    // Do not emit simd32 mad for pre-CNL.
+    return ST->isCNLplus() ? 0x3f : 0x1f;
+  default:
+    break;
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+    // We have a call instruction, so we can assume it is an intrinsic since
+    // otherwise processInst would not have got as far as calling us as
+    // a non-intrinsic call forces isSplittable() to be false.
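The masks returned above (0x1f, 0x3f) follow the convention stated in the function's header comment: bit N set means execution size 1<<N is allowed. A small self-contained sketch of decoding such a mask, for illustration only:

    #include <cstdio>

    // Print the execution widths permitted by a GenX-style allowed-bits mask.
    // 0x1f prints "1 2 4 8 16"; 0x3f additionally prints "32".
    static void printAllowedWidths(unsigned AllowedBits) {
      for (unsigned N = 0; N < 6; ++N)
        if (AllowedBits & (1u << N))
          std::printf("%u ", 1u << N);
      std::printf("\n");
    }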
+ auto CalledF = CI->getCalledFunction(); + assert(CalledF); + GenXIntrinsicInfo II(GenXIntrinsic::getAnyIntrinsicID(CalledF)); + // While we have the intrinsic info, we also spot whether we have a FIXED4 + // operand and/or a TWICEWIDTH operand. + for (auto i = II.begin(), e = II.end(); i != e; ++i) { + auto ArgInfo = *i; + if (ArgInfo.isArgOrRet()) { + switch (ArgInfo.getRestriction()) { + case GenXIntrinsicInfo::FIXED4: + Fixed4 = &CI->getOperandUse(ArgInfo.getArgIdx()); + break; + case GenXIntrinsicInfo::TWICEWIDTH: + TwiceWidth = &CI->getOperandUse(ArgInfo.getArgIdx()); + break; + } + } + } + return II.getExecSizeAllowedBits(); + } + return 0x3f; +} + +/*********************************************************************** + * processInst : process one instruction to legalize execution width and GRF + * crossing + * + * Return: true to re-process same instruction (typically after unbaling + * something from it) + */ +bool GenXLegalization::processInst(Instruction *Inst) { + LLVM_DEBUG(dbgs() << "processInst: " << *Inst << "\n"); + if (Inst->isTerminator()) + return false; // ignore terminator + // Prepare to insert split code after current instruction. + auto InsertBefore = Inst->getNextNode(); + if (isa(Inst)) + return false; // ignore phi node + // Sanity check for illegal operand type + if ((Inst->getType()->getScalarType()->getPrimitiveSizeInBits() == 64) && + !(ST->hasLongLong())) + report_fatal_error( + "'double' and 'long long' type are not supported by this target"); + if (ST->isICLLP() || ST->isTGLLP()) { + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_ssad2: + case GenXIntrinsic::genx_sssad2add: + case GenXIntrinsic::genx_sssad2add_sat: + case GenXIntrinsic::genx_susad2add: + case GenXIntrinsic::genx_susad2add_sat: + case GenXIntrinsic::genx_usad2: + case GenXIntrinsic::genx_ussad2add: + case GenXIntrinsic::genx_ussad2add_sat: + case GenXIntrinsic::genx_uusad2add: + case GenXIntrinsic::genx_uusad2add_sat: + report_fatal_error("'sad2' and 'sada2' are not supported by this target"); + default: + break; + } + } + + if (!isa(Inst->getType())) { + if (Inst->getOpcode() == Instruction::BitCast && + Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + // Special processing for bitcast from predicate to scalar int. + return processBitCastFromPredicate(Inst, InsertBefore); + } + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: + return processAllAny(Inst, + InsertBefore); // Special processing for all/any + default: + break; + } + if (!isa(Inst)) + return false; // no splitting needed for other scalar op. + } + if (isa(Inst)) + return false; + if (isa(Inst)) { + if (Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Special processing for bitcast from scalar int to predicate. + return processBitCastToPredicate(Inst, InsertBefore); + } + // Ignore any other bitcast. + return false; + } + + if (Baling->isBaled(Inst)) { + LLVM_DEBUG(dbgs() << "is baled\n"); + return false; // not head of bale, ignore + } + // No need to split an llvm.genx.constant with an undef value. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + if (isa(Inst->getOperand(0))) + return false; + break; + default: + break; + } + clearBale(); + Baling->buildBale(Inst, &B); + // Get the main inst from the bale and decide whether it is something we do + // not split. If there is no main inst, the bale is splittable. 
+ if (auto MainInst = B.getMainInst()) { + if (isa(MainInst->Inst)) { + // No legalization for inline asm + if (cast(MainInst->Inst)->isInlineAsm()) + return false; + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(MainInst->Inst); + switch (IntrinID) { + case GenXIntrinsic::not_any_intrinsic: + return false; // non-intrinsic call, ignore + case GenXIntrinsic::genx_constantpred: + break; // these intrinsics can be split + default: + if (GenXIntrinsicInfo(IntrinID).getRetInfo().getCategory() != + GenXIntrinsicInfo::GENERAL) { + // This is not an ALU intrinsic (e.g. cm_add). + // We have a non-splittable intrinsic. Such an intrinsic can + // have a scalar arg with a baled in rdregion, which does not + // need legalizing. It never has a vector arg with a baled in + // rdregion. So no legalization needed. + return false; + } + break; + } + } else if (isa(MainInst->Inst)) { + // BitCast is not splittable in here. A non-category-converting BitCast + // is always coalesced in GenXCoalescing, so never generates actual + // code. Thus it does not matter if it has an illegal size. + return false; + } else if (auto LI = dyn_cast(MainInst->Inst)) { + (void)LI; + // Do not split a (global) load as it does not produce code. + return false; + } else if (isa(MainInst->Inst)) { + // If EV is main than it's related to inline assembly with + // multiple outputs, no legalization + return false; + } + // Any other instruction: split. + } + // Check if it is a byte move that we want to transform into a short/int move. + if (EnableTransformByteMove && transformByteMove(&B)) { + // Successfully transformed. Run legalization on the new instruction (which + // got inserted before the existing one, so will be processed next). + LLVM_DEBUG(dbgs() << "done transformByteMove\n"); + return false; + } + // Normal instruction splitting. + LLVM_DEBUG(dbgs() << "processBale: "; B.print(dbgs())); + + if (B.isGstoreBale() && !B.isGStoreBaleLegal()) { +#ifdef _DEBUG + dbgs() << "processBale: "; + B.print(dbgs()); +#endif + report_fatal_error("this g_store bale is not supported yet!"); + } + + return processBale(InsertBefore); +} + +/*********************************************************************** + * processBale : process one bale to legalize execution width and GRF crossing + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::processBale(Instruction *InsertBefore) { + // Get the current execution width. + unsigned WholeWidth = getExecutionWidth(); + if (WholeWidth == 1) + return false; // No splitting of scalar or 1-vector + + // Check the bale split kind if do splitting. + CurSplitKind = checkBaleSplittingKind(); + + // We will be generating a chain of joining wrregions. The initial "old + // value" input is undef. If the bale is headed by a wrregion or + // wrpredpredregion that is being split, code inside splitInst uses the + // original operand 0 for split 0 instead. + Value *Joined = nullptr; + // For bales ending with g_store, joining is not through wrr, but through + // g_load and g_store. + if (CurSplitKind != SplitKind::SplitKind_GStore) + Joined = UndefValue::get(B.getHeadIgnoreGStore()->Inst->getType()); + + // Do the splits. + for (unsigned StartIdx = 0; StartIdx != WholeWidth;) { + // Determine the width of the next split. + unsigned Width = determineWidth(WholeWidth, StartIdx); + if (Width == DETERMINEWIDTH_UNBALE) { + // determineWidth wants us to re-start processing from the head of the + // bale, because it did some unbaling. 
First erase any newly added + // instructions. + for (;;) { + Instruction *Erase = InsertBefore->getPrevNode(); + if (Erase == B.getHead()->Inst) + break; + eraseInst(Erase); + } + return true; // ask to re-start processing + } + if (Width == DETERMINEWIDTH_NO_SPLIT) + return noSplitProcessing(); // no splitting required + // Some splitting is required. This includes the case that there will be + // only one split (i.e. no splitting really required), but: + // * it includes an indirect rdregion that is converted to multi indirect; + // Create the next split. + Joined = splitBale(Joined, StartIdx, Width, InsertBefore); + StartIdx += Width; + } + if (!B.endsWithGStore()) + B.getHead()->Inst->replaceAllUsesWith(Joined); + // Erase the original bale. We erase in reverse order so erasing each one + // removes the uses of earlier ones. However we do not erase an instruction + // that still has uses; that happens for a FIXED4 operand. + InsertBefore = B.getHead()->Inst->getNextNode(); + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + if (bi->Inst->use_empty()) + eraseInst(bi->Inst); + else { + // Do not erase this one as it still has a use; it must be a FIXED4 + // operand so it is used by the new split bales. Instead move it so it + // does not get re-processed by the main loop of this pass. + removingInst(bi->Inst); + bi->Inst->removeFromParent(); + bi->Inst->insertBefore(InsertBefore); + InsertBefore = bi->Inst; + } + } + return false; +} + +/*********************************************************************** + * noSplitProcessing : processing of a splttable bale in the case + * that it is not split + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::noSplitProcessing() { + if (auto SI = dyn_cast(B.getHeadIgnoreGStore()->Inst)) { + // Handle the case that a vector select has a scalar condition. + SI->setOperand(0, + splatPredicateIfNecessary(SI->getCondition(), SI->getType(), + SI, SI->getDebugLoc())); + } + return false; +} + +/*********************************************************************** + * processAllAny : legalize all/any + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::processAllAny(Instruction *Inst, + Instruction *InsertBefore) { + // See if the all/any is already legally sized. + Value *Pred = Inst->getOperand(0); + unsigned WholeSize = Pred->getType()->getVectorNumElements(); + if (getPredPart(Pred, 0).Size == WholeSize) { + // Already legally sized. We need to check whether it is used just in a + // branch or select, possibly via a not; if not we need to convert the + // result to a non-predicate then back to a predicate with a cmp, as there + // is no way of expressing a non-baled-in all/any in the generated code. + if (Inst->hasOneUse()) { + auto User = cast(Inst->use_begin()->getUser()); + if (isNot(User)) { + if (!User->hasOneUse()) + User = nullptr; + else + User = cast(User->use_begin()->getUser()); + } + if (User && (isa(User) || isa(User))) + return false; + } + // Do that conversion. 
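Stepping back to the slicing loop in processBale above: each iteration asks determineWidth for the largest width the bale tolerates at StartIdx and advances by that amount. A simplified standalone model, assuming the per-slice limit is just a power-of-two cap (the real determineWidth also folds in predicate and region constraints):

    #include <algorithm>
    #include <cstdio>

    // Model of the processBale slicing loop: take the largest power-of-two
    // width that fits both the remaining elements and the cap. For
    // WholeWidth=24 and CapWidth=16 this prints slices of 16 and 8.
    static void modelSplits(unsigned WholeWidth, unsigned CapWidth) {
      for (unsigned StartIdx = 0; StartIdx != WholeWidth;) {
        unsigned Remaining = WholeWidth - StartIdx;
        unsigned Width = 1;
        while (Width * 2 <= std::min(Remaining, CapWidth))
          Width *= 2;
        std::printf("slice at %u, width %u\n", StartIdx, Width);
        StartIdx += Width;
      }
    }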
+ const DebugLoc &DL = Inst->getDebugLoc(); + auto I16Ty = Type::getInt16Ty(Inst->getContext()); + auto V1I16Ty = VectorType::get(I16Ty, 1); + Region R(V1I16Ty); + R.Mask = Inst; + auto NewWr = cast(R.createWrRegion( + Constant::getNullValue(V1I16Ty), ConstantInt::get(I16Ty, 1), + Inst->getName() + ".allany_lowered", InsertBefore, DL)); + auto NewBC = CastInst::Create(Instruction::BitCast, NewWr, I16Ty, + NewWr->getName(), InsertBefore); + auto NewPred = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewBC, + Constant::getNullValue(I16Ty), + NewBC->getName(), InsertBefore); + NewPred->setDebugLoc(DL); + NewWr->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + UndefValue::get(Inst->getType())); + Inst->replaceAllUsesWith(NewPred); + NewWr->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, Inst); + return false; + } + // It needs to be split. For each part, we have an all/any on that part, and + // use it to do a select on a scalar that keeps track of whether all/any set + // bits have been found. + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Value *Zero = Constant::getNullValue(I16Ty); + Value *One = ConstantInt::get(I16Ty, 1); + Value *Result = IID == GenXIntrinsic::genx_all ? One : Zero; + const DebugLoc &DL = Inst->getDebugLoc(); + for (unsigned StartIdx = 0; StartIdx != WholeSize;) { + auto PP = getPredPart(Pred, StartIdx); + auto Part = Region::createRdPredRegionOrConst( + Pred, StartIdx, PP.Size, Pred->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = + GenXIntrinsic::getAnyDeclaration(M, IID, Part->getType()); + Instruction *NewAllAny = nullptr; + if (PP.Size != 1) + NewAllAny = CallInst::Create(Decl, Part, + Inst->getName() + ".split" + Twine(StartIdx), + InsertBefore); + else { + // Part is v1i1. All we need to do is bitcast it to i1, which does not + // generate any code. + NewAllAny = CastInst::Create( + Instruction::BitCast, Part, Part->getType()->getScalarType(), + Inst->getName() + ".split" + Twine(StartIdx), InsertBefore); + } + NewAllAny->setDebugLoc(DL); + SelectInst *Sel = nullptr; + if (IID == GenXIntrinsic::genx_all) + Sel = SelectInst::Create(NewAllAny, Result, Zero, + Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore); + else + Sel = SelectInst::Create(NewAllAny, One, Result, + Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore); + Sel->setDebugLoc(DL); + Result = Sel; + StartIdx += PP.Size; + } + // Add a scalar comparison to get the final scalar bool result. + auto Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Result, Zero, + Inst->getName() + ".joincmp", InsertBefore); + // Replace and erase the old all/any. + Inst->replaceAllUsesWith(Cmp); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * processBitCastFromPredicate : legalize bitcast from predicate (vector of + * i1) to scalar int + */ +bool GenXLegalization::processBitCastFromPredicate(Instruction *Inst, + Instruction *InsertBefore) { + Value *Pred = Inst->getOperand(0); + unsigned SplitWidth = getPredPart(Pred, 0).Size; + if (SplitWidth == 0) + return false; +#if _DEBUG + unsigned WholeWidth = Pred->getType()->getVectorNumElements(); + assert(!(WholeWidth % SplitWidth) && "does not handle odd predicate sizes"); +#endif + // Bitcast each split predicate into an element of an int vector. 
+ // For example, if the split size is 16, then the result is a vector + // of i16. Then bitcast that to the original result type. + Type *IntTy = Type::getIntNTy(Inst->getContext(), SplitWidth); + unsigned NumSplits = Inst->getType()->getPrimitiveSizeInBits() / SplitWidth; + if (NumSplits == 1) + return false; + const DebugLoc &DL = Inst->getDebugLoc(); + Type *IntVecTy = VectorType::get(IntTy, NumSplits); + Value *Result = UndefValue::get(IntVecTy); + // For each split... + for (unsigned i = 0; i != NumSplits; ++i) { + // Bitcast that split of the predicate. + auto *NewBitCast = + CastInst::Create(Instruction::BitCast, + getSplitOperand(Inst, /*OperandNum=*/0, i * SplitWidth, + SplitWidth, InsertBefore, DL), + IntTy, Inst->getName() + ".split", InsertBefore); + NewBitCast->setDebugLoc(DL); + // Write it into the element of the vector. + Region R(Result); + R.getSubregion(i, 1); + Result = R.createWrRegion(Result, NewBitCast, + Inst->getName() + ".join" + Twine(i * SplitWidth), + InsertBefore, DL); + } + // Bitcast the vector to the original type. + auto *NewBitCast = + CastInst::Create(Instruction::BitCast, Result, Inst->getType(), + Inst->getName() + ".cast", InsertBefore); + NewBitCast->setDebugLoc(DL); + // Change uses and erase original. + Inst->replaceAllUsesWith(NewBitCast); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * processBitCastToPredicate : legalize bitcast to predicate (vector of + * i1) from scalar int + */ +bool GenXLegalization::processBitCastToPredicate(Instruction *Inst, + Instruction *InsertBefore) { + unsigned WholeWidth = Inst->getType()->getVectorNumElements(); + unsigned SplitWidth = getPredPart(Inst, 0).Size; + assert(!(WholeWidth % SplitWidth) && "does not handle odd predicate sizes"); + unsigned NumSplits = WholeWidth / SplitWidth; + if (NumSplits == 1) + return false; + // Bitcast the scalar int input to a vector of ints each with a number of + // bits matching the predicate split size. + const DebugLoc &DL = Inst->getDebugLoc(); + auto IVTy = VectorType::get(Type::getIntNTy(Inst->getContext(), SplitWidth), + WholeWidth / SplitWidth); + auto IntVec = CastInst::Create(Instruction::BitCast, Inst->getOperand(0), + IVTy, Inst->getName() + ".cast", InsertBefore); + IntVec->setDebugLoc(DL); + Value *Result = UndefValue::get(Inst->getType()); + Type *SplitPredTy = + VectorType::get(Inst->getType()->getScalarType(), SplitWidth); + // For each predicate split... + for (unsigned i = 0; i != NumSplits; ++i) { + // Get the element of the vector using rdregion. + Region R(IntVec); + R.getSubregion(i, 1); + auto NewRd = R.createRdRegion( + IntVec, Inst->getName() + ".rdsplit" + Twine(i), InsertBefore, DL); + // Bitcast that element of the int vector to a predicate. + auto NewPred = + CastInst::Create(Instruction::BitCast, NewRd, SplitPredTy, + Inst->getName() + ".split" + Twine(i), InsertBefore); + NewPred->setDebugLoc(DL); + // Join into the overall result using wrpredregion. + auto NewWr = Region::createWrPredRegion( + Result, NewPred, i * SplitWidth, Inst->getName() + ".join" + Twine(i), + InsertBefore, DL); + // If this is the first wrpredregion, add it to IllegalPredicates so it gets + // processed later in fixIllegalPredicates. + if (!i) + IllegalPredicates.insert(NewWr); + Result = NewWr; + } + // Change uses and erase original. 
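As a scalar model of the two lowerings above (processBitCastFromPredicate and processBitCastToPredicate), assuming a legal predicate part size of 16: each 16-wide part maps to one 16-bit lane of the integer, and the lanes concatenate back into the original 32-bit value:

    #include <cstdint>

    // Forward direction: two predicate parts become the two 16-bit lanes of
    // the i32 result (the wrregion joins in the pass play this role).
    static uint32_t joinPredicateParts(uint16_t Part0, uint16_t Part1) {
      return static_cast<uint32_t>(Part0) | (static_cast<uint32_t>(Part1) << 16);
    }

    // Reverse direction: slice the 32-bit value back into two 16-bit parts
    // (the rdregion/bitcast pairs in processBitCastToPredicate).
    static void splitPredicateParts(uint32_t Whole, uint16_t &Part0,
                                    uint16_t &Part1) {
      Part0 = static_cast<uint16_t>(Whole & 0xffffu);
      Part1 = static_cast<uint16_t>(Whole >> 16);
    }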
+ Inst->replaceAllUsesWith(Result); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * getExecutionWidth : get the execution width of the bale + * + * If there is no wrregion at the head of the bale, then the execution width is + * the width of the head. If there is a wrregion or wrpredpredregion, then the + * execution width is the width of the subregion input to the wrregion. + */ +unsigned GenXLegalization::getExecutionWidth() { + BaleInst *Head = B.getHeadIgnoreGStore(); + Value *Dest = Head->Inst; + if (Head->Info.Type == BaleInfo::WRREGION || + Head->Info.Type == BaleInfo::WRPREDREGION || + Head->Info.Type == BaleInfo::WRPREDPREDREGION) + Dest = Head->Inst->getOperand(1); + VectorType *VT = dyn_cast(Dest->getType()); + if (!VT) + return 1; + return VT->getNumElements(); +} + +/*********************************************************************** + * determineWidth : determine width of the next split + * + * Enter: WholeWidth = whole execution width of the bale before splitting + * StartIdx = start index of this split + * + * Return: width of next split, DETERMINEWIDTH_UNBALE if unbaling occurred, + * DETERMINEWIDTH_NO_SPLIT if no split required + * + * If this function returns WholeWidth rather than DETERMINEWIDTH_NO_SPLIT, it + * means that there is an indirect rdregion that needs to be converted to multi + * indirect. This is different to the condition of not needing a split at all, + * which causes this function to return DETERMINEWIDTH_NO_SPLIT. + */ +unsigned GenXLegalization::determineWidth(unsigned WholeWidth, + unsigned StartIdx) { + // Prepare to keep track of whether an instruction with a minimum width + // (e.g. dp4) would be split too small, and whether we need to unbale. + unsigned ExecSizeAllowedBits = 0x3f; + if (auto Main = B.getMainInst()) + ExecSizeAllowedBits = getExecSizeAllowedBits(Main->Inst); + unsigned MainInstMinWidth = + 1 << countTrailingZeros(ExecSizeAllowedBits, ZB_Undefined); + // Determine the vector width that we need to split into. + bool IsReadSameVector = false; + unsigned Width = WholeWidth - StartIdx; + unsigned PredMinWidth = 1; + Value *WrRegionInput = nullptr; + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type == BaleInfo::WRREGION) + WrRegionInput = + Head->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + bool MustSplit = false; + for (Bale::iterator i = B.begin(), InstWithMinWidth = i, e = B.end(); i != e; + ++i) { + unsigned ThisWidth = Width; + // Determine the width we need for this instruction. + switch (i->Info.Type) { + case BaleInfo::WRREGION: { + bool Unbale = false; + Region R(i->Inst, i->Info); + if (R.Mask && + !i->Info.isOperandBaled(GenXIntrinsic::GenXRegion::PredicateOperandNum)) { + // We have a predicate, and it is not a baled in rdpredregion. (A + // baled in rdpredregion is handled when this loop reaches that + // instruction.) Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize(R.Mask, R.ElementTy, StartIdx); + ThisWidth = std::min(ThisWidth, PredWidths.Max); + PredMinWidth = PredWidths.Min; + } + if (PredMinWidth > Width) { + // The min predicate size is bigger than the legal size for the rest + // of the bale other than the wrregion. Unbale the main instruction. + Unbale = true; + } + // Get the max legal size for the wrregion. 
+ ThisWidth = std::min( + ThisWidth, + R.getLegalSize( + StartIdx, false /*Allow2D*/, + i->Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo))); + if (!Unbale && R.Mask && PredMinWidth > ThisWidth) { + // The min predicate size (from this wrregion) is bigger than the + // legal size for this wrregion. We have to rewrite the wrregion as: + // rdregion of the region out of the old value + // predicated wrregion, which now has a contiguous region + // wrregion (the original wrregion but with no predicate) + // then set DETERMINEWIDTH_UNBALE to restart. + auto DL = i->Inst->getDebugLoc(); + auto NewRd = R.createRdRegion( + i->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum), + i->Inst->getName() + ".separatepred.rd", i->Inst, DL, false); + Baling->setBaleInfo(NewRd, BaleInfo(BaleInfo::RDREGION)); + Region R2(NewRd); + R2.Mask = R.Mask; + auto NewWr = cast(R2.createWrRegion( + NewRd, + i->Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum), + i->Inst->getName() + ".separatepred.wr", i->Inst, DL)); + auto NewBI = i->Info; + NewBI.clearOperandBaled(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + Baling->setBaleInfo(NewWr, NewBI); + i->Inst->setOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum, NewWr); + i->Inst->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + Constant::getAllOnesValue(R.Mask->getType())); + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::PredicateOperandNum); + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(i->Inst, i->Info); + ThisWidth = DETERMINEWIDTH_UNBALE; + break; + } + if (PredMinWidth > ThisWidth) { + // The min predicate size (from a select baled into this wrregion) is + // bigger than the legal size for this wrregion. Unbale the select. + Unbale = true; + } + if (ThisWidth < MainInstMinWidth) { + // The wrregion is split too small for the main instruction. Unbale + // the main instruction. + Unbale = true; + } + if (Unbale) { + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(i->Inst, i->Info); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + break; + } + case BaleInfo::RDREGION: { + if (i->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == + WrRegionInput) + IsReadSameVector = true; // See use of this flag below. + // Determine the max region width. If this rdregion is baled into a + // TWICEWIDTH operand, double the start index and half the resulting + // size. + Region R(i->Inst, i->Info); + unsigned Doubling = TwiceWidth && i->Inst == *TwiceWidth; + unsigned ModifiedStartIdx = StartIdx << Doubling; + if (Fixed4 && i->Inst == *Fixed4) + ModifiedStartIdx = 0; + ThisWidth = R.getLegalSize( + ModifiedStartIdx, true /*Allow2D*/, + i->Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo)); + if (ThisWidth == 1 && + R.Indirect && !R.isMultiIndirect()) { + // This is a single indirect rdregion where we failed to make the + // valid size any more than one. If possible, increase the valid size + // to 4 or 8 on the assumption that we are going to convert it to a + // multi indirect. + auto NewThisWidth = 1 << genx::log2(R.Width - StartIdx % R.Width); + if (NewThisWidth >= 4) { + ThisWidth = std::min(NewThisWidth, 8); + MustSplit = true; + } + } + ThisWidth >>= Doubling; + if (ThisWidth < MainInstMinWidth) { + // The rdregion is split too small for the main instruction. + // Unbale the rdregion from its user (must be exactly one user as + // it is baled). 
Note that the user is not necessarily the main + // inst, it might be a modifier baled in to the main inst. + Value::use_iterator UI = i->Inst->use_begin(); + Instruction *User = cast(UI->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.clearOperandBaled(UI->getOperandNo()); + Baling->setBaleInfo(User, BI); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + break; + } + case BaleInfo::NOTP: + // Only process notp + // - if predicate is a vector and + // - if it does not have rdpredregion baled in. + if (!i->Info.isOperandBaled(0) && i->Inst->getType()->isVectorTy()) { + // Get the min and max legal predicate size. First get the element type + // from the wrregion or select that the notp is baled into. + Type *ElementTy = nullptr; + auto Head = B.getHeadIgnoreGStore()->Inst; + if (Head != i->Inst) + ElementTy = Head->getOperand(1)->getType()->getScalarType(); + auto PredWidths = + getLegalPredSize(i->Inst->getOperand(0), ElementTy, StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This + // situation arises when the original unsplit bale has an odd size + // rdpredregion out of a v32i1, from a CM select() where the mask is an + // i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + ThisWidth = std::min(ThisWidth, PredWidths.Max); + } + break; + case BaleInfo::RDPREDREGION: { + unsigned RdPredStart = + cast(i->Inst->getOperand(1))->getZExtValue(); + // Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize( + i->Inst->getOperand(0), // the input predicate + cast(i->Inst->use_begin()->getUser()) + ->getOperand(1) + ->getType() + ->getScalarType(), // the wrregion/select element type + RdPredStart + StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This situation + // arises when the original unsplit bale has an odd size rdpredregion + // out of a v32i1, from a CM select() where the mask is an i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + ThisWidth = std::min(ThisWidth, PredWidths.Max); + break; + } + case BaleInfo::SHUFFLEPRED: { + // If shufflepred is baled with load with channels then it is always legal. + if (const BaleInst *BI = B.getMainInst()) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(BI->Inst); + switch (IID) { + default: + break; + case GenXIntrinsic::genx_gather4_scaled2: + continue; + } + } + + // In other case we need to legalize it using rdpredregion. + // Probably later rdpredregion will be legalized further. + auto *SI = cast(i->Inst); + return ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SI).SliceSize; + } + case BaleInfo::ADDRADD: + case BaleInfo::ADDROR: + case BaleInfo::GSTORE: + break; + default: { + ThisWidth = determineNonRegionWidth(i->Inst, StartIdx); + Value *Pred = nullptr; + if (auto SI = dyn_cast(i->Inst)) { + Pred = SI->getCondition(); + if (!isa(Pred->getType())) { + // For a select with a scalar predicate, the predicate will be + // splatted by splatPredicateIfNecessary. We need to limit the legal + // width to the max predicate width. 
+ ThisWidth = std::min(ThisWidth, MaxPredSize); + Pred = nullptr; + } + } else if (isa(i->Inst)) + Pred = i->Inst; + if (Pred && isa(Pred->getType())) { + // For a select (with a vector predicate) or cmp, we need to take the + // predicate into account. Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize( + Pred, i->Inst->getOperand(1)->getType()->getVectorElementType(), + StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This + // situation arises when the original unsplit bale has an odd size + // rdpredregion out of a v32i1, from a CM select() where the mask is an + // i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + if (PredMinWidth > Width) { + // The min predicate size is bigger than the legal size for the + // rest of the bale so far. There must be a rdregion that needs to + // be split too much. Unbale it. + assert(InstWithMinWidth->Info.Type == BaleInfo::RDREGION); + Instruction *RdToUnbale = InstWithMinWidth->Inst; + Use *U = &*RdToUnbale->use_begin(); + auto User = cast(U->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.clearOperandBaled(U->getOperandNo()); + Baling->setBaleInfo(User, BI); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + ThisWidth = std::min(ThisWidth, PredWidths.Max); + } + break; + } + } + if (ThisWidth < Width) { + InstWithMinWidth = i; + Width = ThisWidth; + } + if (Width == DETERMINEWIDTH_UNBALE) + return DETERMINEWIDTH_UNBALE; + } + while (!(ExecSizeAllowedBits & Width)) { + // This width is disallowed by the main instruction. We have already + // dealt with the case where there is a minimum width above; the + // code here is for when there is a particular disallowed width + // (e.g. bfi disallows width 2 but allows 1). Try a smaller width. + assert(Width != 1); + Width >>= 1; + } + if (Width != WholeWidth && IsReadSameVector && + CurSplitKind == SplitKind_Normal) { + // Splitting required, and the bale contains a rdregion from the same + // vector as the wrregion's old value input, and we're not already + // unbaling. Splitting that would result + // in the original value of the vector and a new value being live at the + // same time, so we avoid it by unbaling the wrregion. The resulting + // code will use an intermediate smaller register for the result of the + // main inst before writing that back in to a region of the vector. + // + // Note that this unbaling is necessary despite pretty much the same + // thing being done in second baling in GenXBaling::unbaleBadOverlaps. + // Not doing the unbaling here results in code where the split rdregions + // and wrregions are interleaved, so the unbaling in + // GenXBaling::unbaleBadOverlaps does not actually stop the bad live range + // overlap. (This might change if we had a pass to schedule to reduce + // register pressure.) + auto Head = B.getHeadIgnoreGStore(); + Head->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(Head->Inst, Head->Info); + LLVM_DEBUG( + dbgs() + << "GenXLegalization unbaling when rdr and wrr use same vector\n"); + return DETERMINEWIDTH_UNBALE; + } + if (Width == WholeWidth && !MustSplit) { + // No split required, so return that to the caller, which then just + // returns. 
However we do not do that if MustSplit is set, because there + // is some reason we need to go through splitting code anyway, one of: + // 1. there is an rdregion that needs to be converted to multi indirect; + // 2. there is an rdpredregion. + return DETERMINEWIDTH_NO_SPLIT; + } + + // If join is generated after splitting, need to check destination region rule + { + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type != BaleInfo::WRREGION && + Head->Info.Type != BaleInfo::WRPREDPREDREGION) { + auto VT = cast(Head->Inst->getType()); + unsigned VecSize = VT->getNumElements(); + if (VecSize != Width) { + if (!VT->getElementType()->isIntegerTy(1)) { + Region R(Head->Inst); + auto ThisWidth = R.getLegalSize(StartIdx, false /*no 2d for dst*/, + VecSize, ST, &(Baling->AlignInfo)); + if (ThisWidth < Width) { + Width = ThisWidth; + } + } + } + } + } + + return Width; +} + +/*********************************************************************** + * determineNonRegionWidth : determine max valid width of non-region instruction + * + * Enter: Inst = the instruction + * StartIdx = start index + * + * Return: max valid width + */ +unsigned GenXLegalization::determineNonRegionWidth(Instruction *Inst, + unsigned StartIdx) { + VectorType *VT = dyn_cast(Inst->getType()); + if (!VT) + return 1; + unsigned Width = VT->getNumElements() - StartIdx; + unsigned BytesPerElement = VT->getElementType()->getPrimitiveSizeInBits() / 8; + // Check whether the operand element size is bigger than the result operand + // size. Normally we just check operand 0. This won't work on a select, and + // we don't need to do the check on a select anyway as its operand and result + // type are the same. + if (!isa(Inst)) { + unsigned NumOperands = Inst->getNumOperands(); + if (CallInst *CI = dyn_cast(Inst)) + NumOperands = CI->getNumArgOperands(); + if (NumOperands) { + assert(isa(Inst->getOperand(0)->getType()) && + "instruction not supported"); + unsigned InBytesPerElement = + cast(Inst->getOperand(0)->getType()) + ->getElementType() + ->getPrimitiveSizeInBits() / + 8; + if (InBytesPerElement > BytesPerElement) + BytesPerElement = InBytesPerElement; + } + } + unsigned int TwoGRFWidth = ST ? (2 * ST->getGRFWidth()) : 64; + if (BytesPerElement) { + // Non-predicate result. + if (Width * BytesPerElement > TwoGRFWidth) + Width = TwoGRFWidth / BytesPerElement; + Width = 1 << genx::log2(Width); + } else { + // Predicate result. This is to handle and/or/xor/not of predicates; cmp's + // def of a predicate is handled separately where this function is called + // in determineWidth(). + Width = getPredPart(Inst, StartIdx).Size; + } + return Width; +} + +/*********************************************************************** + * getLegalPredSize : get legal predicate size + * + * Enter: Pred = predicate value + * ElementTy = element type, 0 to assume not 64 bit + * StartIdx = start index in that predicate + * RemainingSize = remaining size from StartIdx in whole vector + * operation being split, or 0 to imply from the + * number of elements in the type of Pred + * + * Return: Min = min legal size + * Max = max legal size + */ +LegalPredSize GenXLegalization::getLegalPredSize(Value *Pred, Type *ElementTy, + unsigned StartIdx, + unsigned RemainingSize) { + // Get details of the part containing StartIdx. + auto PP = getPredPart(Pred, StartIdx); + // Set Min to 8, or 4 if the element type of the operation using the + // intrinsic is 64 bit. Doing this ensures that the next split in the same + // part is on a legal offset. 
The offset of a split within a part must be 8 + // aligned, or 4 aligned if the element type is 64 bit. + LegalPredSize Ret; + Ret.Min = !ElementTy ? 8 : ElementTy->getPrimitiveSizeInBits() != 64 ? 8 : 4; + // Set Max to the remaining size left in this part, rounded down to a power + // of two. + unsigned LogMax = Log2_32(PP.Size - StartIdx + PP.Offset); + // However, Max cannot be any bigger than the misalignment of the offset into + // the part. For example. if the offset is 4 or 12, the size must be 4, not 8 + // or 16. + LogMax = std::min(LogMax, findFirstSet(StartIdx - PP.Offset)); + Ret.Max = 1 << LogMax; + // If Min>Max, then we're at the end of that part and we don't need to ensure + // that the next split in the same part is legally aligned. + Ret.Min = std::min(Ret.Min, Ret.Max); + return Ret; +} + +/*********************************************************************** + * getPredPart : get info on which part of a predicate an index is in + * + * Enter: V = a value of predicate type + * Offset = offset to get info on + * + * Return: PredPart struct with + * Offset = start offset of the part + * Size = size of the part + * PartNum = part number + * + * On entry, Offset is allowed to be equal to the total size of V, in which + * case the function returns PartNum = the number of parts and Size = 0. + * + * This function is what determines how an illegally sized predicate is divided + * into parts. It is constrained by vISA only allowing a power of two size for + * each part. Therefore it divides into zero or more 32 bit parts (currently 16 + * bit), then descending powers of two to fill up any odd size end. + * + * These parts correspond to how predicate values in the IR are divided up, not + * just how instructions that use or define them get legalized. Thus a + * predicate of size 13 actually gets divided into parts of 8,4,1 as vISA + * predicate registers P1,P2,P3 (for example). + */ +PredPart GenXLegalization::getPredPart(Value *V, unsigned Offset) { + unsigned WholeSize = V->getType()->getVectorNumElements(); + PredPart Ret; + if (Offset == WholeSize && !(WholeSize & (MaxPredSize - 1))) { + Ret.Offset = Offset; + Ret.Size = 0; + Ret.PartNum = Offset / MaxPredSize; + return Ret; + } + if ((Offset ^ WholeSize) & -MaxPredSize) { + // This is in one of the 32 bit parts. + Ret.Offset = Offset & -MaxPredSize; + Ret.Size = MaxPredSize; + Ret.PartNum = Offset / MaxPredSize; + return Ret; + } + // This is in the odd less-than-32 section at the end. + Ret.Offset = WholeSize & -MaxPredSize; + Ret.PartNum = WholeSize / MaxPredSize; + for (unsigned Pwr2 = MaxPredSize / 2U;; Pwr2 >>= 1) { + if (Pwr2 <= Offset - Ret.Offset) { + Ret.Offset += Pwr2; + ++Ret.PartNum; + if (Offset == WholeSize && Ret.Offset == Offset) { + Ret.Size = 0; + break; + } + } + if (Pwr2 <= WholeSize - Ret.Offset && Pwr2 > Offset - Ret.Offset) { + Ret.Size = Pwr2; + break; + } + } + return Ret; +} + +/************************************************************************ + * SplittableInsts : takes Bale and constructs the range of splittable + * instructions of this bale + * + * Splittable are those instructions that later will be split. By current design + * it is all instruction except last wrregion or wrregion+gstore. + * + * Usage: for (auto BI : SplittableInsts(B)), SplittableInst(B).begin(),... 
+ */ +class SplittableInsts { + Bale::iterator Begin; + Bale::iterator End; + +public: + SplittableInsts(Bale &SomeBale) : Begin(SomeBale.begin()) { + auto HeadIt = SomeBale.getHeadIgnoreGStoreIt(); + // Only WRREGION, WRPREDPREDREGION, GSTORE should be joined, thus the + // instructions before them should be split + if (HeadIt->Info.Type == BaleInfo::WRREGION || + HeadIt->Info.Type == BaleInfo::WRPREDPREDREGION) + End = HeadIt; + else { + assert(HeadIt->Info.Type != BaleInfo::GSTORE && + "GSTORE must have been considered before"); + End = SomeBale.end(); + } + } + Bale::iterator begin() { return Begin; } + Bale::iterator end() { return End; } +}; + +/*********************************************************************** + * joinBaleInsts : create join instructions in bale + * (2 in case of gstore, 1 - otherwise) + */ +Value *GenXLegalization::joinBaleInsts(Value *PrevSliceRes, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(SplittableInsts(B).end() != B.end() && + "must have some instructions to join in the bale"); + if (B.endsWithGStore()) { + assert(SplittableInsts(B).end() == B.getPreHeadIt() && + "a bale is considered to have only 1 dst, in case of GSTORE it's " + "represented by the last 2 instructions"); + return joinGStore(PrevSliceRes, *B.getHead(), *B.getPreHead(), StartIdx, + Width, InsertBefore); + } else { + assert(SplittableInsts(B).end() == B.getHeadIt() && + "a bale is considered to have only 1 dst, in common case it's " + "represented by the last instruction"); + return joinAnyWrRegion(PrevSliceRes, *B.getHead(), StartIdx, Width, + InsertBefore); + } +} + +/*********************************************************************** + * If the last instruction in the created bale is a split instruction, + * need to join this result into the overall result with a wrregion or + * wrpredregion. Do not generate the join if it is a write into the whole + * of the overall result, which can happen when going through the split + * code even when no split is required other than conversion to multi + * indirect. + */ +Value *GenXLegalization::joinBaleResult(Value *PrevSliceRes, + Value *LastSplitInst, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(PrevSliceRes && LastSplitInst && InsertBefore && "wrong arguments"); + auto Head = B.getHeadIgnoreGStore()->Inst; + auto VT = cast(Head->getType()); + assert(VT->getNumElements() != Width && + "there's no need to join results if they have the proper type"); + if (VT->getElementType()->isIntegerTy(1)) { + auto NewWr = Region::createWrPredRegion( + PrevSliceRes, LastSplitInst, StartIdx, + LastSplitInst->getName() + ".join" + Twine(StartIdx), InsertBefore, + Head->getDebugLoc()); + // If this is the first wrpredregion into an illegally sized predicate, + // save it for processing later. (Only the first one could possibly be + // the root of a tree of wrpredregions, and only the roots of + // wrpredregion trees need to be in IllegalPredicates.) 
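Referring back to getPredPart above: an illegally sized predicate is carved into whole MaxPredSize (32) parts first, then descending powers of two for the odd tail, so a 13-element predicate becomes parts of 8, 4 and 1. A small model of that documented decomposition, not the pass code itself:

    #include <cstdio>

    // Enumerate the parts described in getPredPart's comment. For
    // WholeSize=13 this prints parts of size 8, 4 and 1 at offsets 0, 8, 12.
    static void printPredParts(unsigned WholeSize) {
      unsigned Offset = 0;
      while (WholeSize - Offset >= 32) { // whole 32-wide parts
        std::printf("part at %u, size 32\n", Offset);
        Offset += 32;
      }
      for (unsigned Pwr2 = 16; Pwr2 >= 1 && Offset != WholeSize; Pwr2 >>= 1)
        if (WholeSize - Offset >= Pwr2) { // descending powers of two
          std::printf("part at %u, size %u\n", Offset, Pwr2);
          Offset += Pwr2;
        }
    }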
+ if (!StartIdx) { + auto PredSize = getLegalPredSize(NewWr, nullptr, 0); + if (PredSize.Max != NewWr->getType()->getVectorNumElements()) + IllegalPredicates.insert(NewWr); + } + return NewWr; + } else { + Region R(Head); + R.Width = R.NumElements = Width; + R.Offset = StartIdx * R.ElementBytes; + return R.createWrRegion(PrevSliceRes, LastSplitInst, + LastSplitInst->getName() + ".join" + + Twine(StartIdx), + InsertBefore, Head->getDebugLoc()); + } +} + +/*********************************************************************** + * splitBale : create one slice of the bale + * + * Enter: PrevSliceRes = result of previously created bale slice, + * undef if this is the first one + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * + * Return: result of this split + */ +Value *GenXLegalization::splitBale(Value *PrevSliceRes, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore) { + Value *LastCreatedInst = nullptr; + auto SplittableInstsRange = SplittableInsts(B); + for (auto BI : SplittableInstsRange) + // Split the instruction. + SplitMap[BI.Inst] = LastCreatedInst = + splitInst(PrevSliceRes, BI, StartIdx, Width, InsertBefore, + BI.Inst->getDebugLoc()); + if (SplittableInstsRange.end() != B.end()) + LastCreatedInst = + joinBaleInsts(PrevSliceRes, StartIdx, Width, InsertBefore); + else { + assert(LastCreatedInst && "must have at least some split inst"); + auto Head = B.getHeadIgnoreGStore()->Inst; + if (cast(Head->getType())->getNumElements() != Width) + LastCreatedInst = joinBaleResult(PrevSliceRes, LastCreatedInst, StartIdx, + Width, InsertBefore); + } + SplitMap.clear(); + return LastCreatedInst; +} + +// joins both gstore inst and the wrregion which gstore stores +// more info at joinAnyWrRegion +Value *GenXLegalization::joinGStore(Value *PrevSliceRes, BaleInst GStore, + BaleInst WrRegion, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore) { + assert(GStore.Info.Type == BaleInfo::GSTORE && "wrong argument"); + Value *Op = + joinAnyWrRegion(PrevSliceRes, WrRegion, StartIdx, Width, InsertBefore); + return new StoreInst(Op, GStore.Inst->getOperand(1), /*volatile*/ true, + InsertBefore); +} + +// specialized join function for wrregion instruction +// more info at joinAnyWrRegion +Value *GenXLegalization::joinWrRegion(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore) { + assert(BInst.Info.Type == BaleInfo::WRREGION && "wrong argument"); + Region R(BInst.Inst, BInst.Info); + R.getSubregion(StartIdx, Width); + if (R.Mask && isa(R.Mask->getType())) + R.Mask = getSplitOperand( + BInst.Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum, StartIdx, + Width, InsertBefore, BInst.Inst->getDebugLoc()); + // For SplitIdx==0, the old vector value comes from the original + // wrregion. Otherwise it comes from the split wrregion created + // last time round. + Value *In = !StartIdx ? 
BInst.Inst->getOperand(0) : PrevSliceRes; + if (CurSplitKind == SplitKind::SplitKind_GStore && StartIdx != 0) { + Instruction *ST = B.getHead()->Inst; + assert(isa(ST)); + Value *GV = ST->getOperand(1); + In = new LoadInst(GV, ".gload", /*volatile*/ true, InsertBefore); + } + Value *NewWrRegion = + R.createWrRegion(In, + getSplitOperand(BInst.Inst, 1, StartIdx, Width, + InsertBefore, BInst.Inst->getDebugLoc()), + BInst.Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore, BInst.Inst->getDebugLoc()); + return NewWrRegion; +} + +// specialized join function for wrpredpredregion instruction +// more info at joinAnyWrRegion +Value *GenXLegalization::joinPredPredWrRegion(Value *PrevSliceRes, + BaleInst BInst, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(BInst.Info.Type == BaleInfo::WRPREDPREDREGION && "wrong argument"); + unsigned WrPredStart = + cast(BInst.Inst->getOperand(2))->getZExtValue(); + Value *WrPredNewVal = getSplitOperand( + BInst.Inst, 1, StartIdx, Width, InsertBefore, BInst.Inst->getDebugLoc()); + // For SplitIdx==0, the old vector value comes from the original + // wrregion. Otherwise it comes from the split wrregion created + // last time round. + Value *In = !StartIdx ? BInst.Inst->getOperand(0) : PrevSliceRes; + // Create the split wrpredpredregion. Note that the mask is passed in + // its original unsplit form; the spec of wrpredpredregion is that the + // mask is the same size as the result, and the index is used to slice + // the mask as well as to determine the slice where the value is written + // in the result. + return Region::createWrPredPredRegion( + In, WrPredNewVal, StartIdx + WrPredStart, BInst.Inst->getOperand(3), + BInst.Inst->getName() + ".join" + Twine(StartIdx), InsertBefore, + BInst.Inst->getDebugLoc()); +} + +/*********************************************************************** + * joinAnyWrRegion : join any wrregion instruction in the bale + * + * Enter: PrevSliceRes = result of previously created bale slice, + * undef if this is the first one + * BInst = the BaleInst to join + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * + * Return: the new join value. Join value/instruction has original ("illegal") + * width elements. Each bale slice writes its own part of the value. + */ +Value *GenXLegalization::joinAnyWrRegion(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore) { + switch (BInst.Info.Type) { + case BaleInfo::WRREGION: + return joinWrRegion(PrevSliceRes, BInst, StartIdx, Width, InsertBefore); + break; + case BaleInfo::WRPREDPREDREGION: + return joinPredPredWrRegion(PrevSliceRes, BInst, StartIdx, Width, + InsertBefore); + break; + default: + llvm_unreachable("unexpected/unsupported instruction"); + } +} + +/*********************************************************************** + * splitInst : split an instruction in the bale + * + * Enter: PrevSliceRes = result of previous bale slice, + * undef if this is the first one + * BInst = the BaleInst to split + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * DL = debug location to give new instruction(s) + * + * Return: the new split value + * Split value/instruction has Width elements. 
+ */ +Value *GenXLegalization::splitInst(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL) { + switch (BInst.Info.Type) { + case BaleInfo::GSTORE: + case BaleInfo::WRREGION: + case BaleInfo::WRPREDPREDREGION: + llvm_unreachable("these instructions must be processed in join functions"); + break; + case BaleInfo::RDREGION: { + // Allow for this being a rdregion baled in to a TWICEWIDTH operand. + // If it is, double the start index and width. + unsigned Doubling = TwiceWidth && BInst.Inst == *TwiceWidth; + StartIdx <<= Doubling; + Width <<= Doubling; + // Get the subregion. + Region R(BInst.Inst, BInst.Info); + // Check whether this is an indirect operand that was allowed only + // because we assumed that we are going to convert it to a multi + // indirect. + bool ConvertToMulti = + R.Indirect && Width != 1 && + R.getLegalSize( + StartIdx, true /*Allow2D*/, + BInst.Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo)) == 1; + + R.getSubregion(StartIdx, Width); + // The region to read from. This is normally from the input region baled + // in. If this is reading from and writing to the same region and + // split progapation is on, then just reading from the last joined value + // (but not the initial undef). + // + Value *OldVal = BInst.Inst->getOperand(0); + if (PrevSliceRes && !isa(PrevSliceRes) && + CurSplitKind == SplitKind_Propagation) { + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type == BaleInfo::WRREGION) { + Value *WrRegionInput = Head->Inst->getOperand(0); + if (OldVal == WrRegionInput) + OldVal = PrevSliceRes; + } + } + if (!ConvertToMulti) { + // Not converting to multi indirect. + return R.createRdRegion( + OldVal, BInst.Inst->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + } + // Converting to multi indirect. + return convertToMultiIndirect(BInst.Inst, OldVal, &R, InsertBefore); + } + case BaleInfo::RDPREDREGION: { + unsigned RdPredStart = + cast(BInst.Inst->getOperand(1))->getZExtValue(); + Value *RdPredInput = BInst.Inst->getOperand(0); + return Region::createRdPredRegionOrConst( + RdPredInput, RdPredStart + StartIdx, Width, + BInst.Inst->getName() + ".split" + Twine(StartIdx), InsertBefore, DL); + } + case BaleInfo::SHUFFLEPRED: { + // If we need to split predication shuffle vector, then we definitely failed to + // bale it with channel instruction. In this case we do not need such complicated + // predication logic anymore and can fallback to rdpredregions. + auto *SI = cast(BInst.Inst); + auto RS = ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SI); + assert(RS.SliceSize == Width && "Unexpected width for predicate shuffle split"); + Value *Pred = SI->getOperand(0); + return Region::createRdPredRegionOrConst( + Pred, RS.InitialOffset, Width, + SI->getName() + ".split" + Twine(StartIdx), InsertBefore, DL); + } + } + // Splitting non-region instruction. 
+ assert(!isa(BInst.Inst) && "not expecting to split phi node"); + if (CastInst *CI = dyn_cast(BInst.Inst)) { + Type *CastToTy = VectorType::get( + cast(CI->getType())->getElementType(), Width); + Instruction *NewInst = CastInst::Create( + CI->getOpcode(), + getSplitOperand(CI, 0, StartIdx, Width, InsertBefore, DL), CastToTy, + CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + if (BinaryOperator *BO = dyn_cast(BInst.Inst)) { + Instruction *NewInst = BinaryOperator::Create( + BO->getOpcode(), + getSplitOperand(BO, 0, StartIdx, Width, InsertBefore, DL), + getSplitOperand(BO, 1, StartIdx, Width, InsertBefore, DL), + BO->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } +#if (LLVM_VERSION_MAJOR > 8) + if (UnaryOperator *UO = dyn_cast(BInst.Inst)) { + Instruction *NewInst = UnaryOperator::Create( + UO->getOpcode(), + getSplitOperand(UO, 0, StartIdx, Width, InsertBefore, DL), + UO->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } +#endif + if (CmpInst *CI = dyn_cast(BInst.Inst)) { + Instruction *NewInst = CmpInst::Create( + CI->getOpcode(), CI->getPredicate(), + getSplitOperand(CI, 0, StartIdx, Width, InsertBefore, DL), + getSplitOperand(CI, 1, StartIdx, Width, InsertBefore, DL), + CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + if (auto SI = dyn_cast(BInst.Inst)) { + Value *Selector = getSplitOperand(SI, 0, StartIdx, Width, InsertBefore, DL); + Selector = splatPredicateIfNecessary(Selector, Width, InsertBefore, DL); + auto Split1 = getSplitOperand(SI, 1, StartIdx, Width, InsertBefore, DL); + auto Split2 = getSplitOperand(SI, 2, StartIdx, Width, InsertBefore, DL); + auto NewInst = SelectInst::Create( + Selector, Split1, Split2, SI->getName() + ".split" + Twine(StartIdx), + InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + // Must be a splittable intrinsic. + CallInst *CI = dyn_cast(BInst.Inst); + assert(CI); + auto CalledF = CI->getCalledFunction(); + assert(CalledF); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(CalledF); + assert(GenXIntrinsic::isAnyNonTrivialIntrinsic(IntrinID)); + if (IntrinID == GenXIntrinsic::genx_constanti || + IntrinID == GenXIntrinsic::genx_constantf) { + // This is the constant loading intrinsic. + // We don't need to load the split constants, since a constant value-to- + // write operand is valid in the wrregions that will be used to link + // the values back together. + return getSplitOperand(BInst.Inst, 0, StartIdx, Width, InsertBefore, DL); + } + + // Some other splittable intrinsic. 
+ SmallVector Args; + SmallVector OverloadedTypes; + OverloadedTypes.push_back( + VectorType::get(cast(BInst.Inst->getType())->getElementType(), + Width)); // RetTy + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + Use *U = &CI->getOperandUse(i); + if (U == Fixed4) { + Args.push_back(CI->getArgOperand(i)); + } else if (U == TwiceWidth) { + // TWICEWIDTH: operand is twice the width of other operand and result + Args.push_back(getSplitOperand(BInst.Inst, i, StartIdx * 2, Width * 2, + InsertBefore, DL)); + } else + Args.push_back( + getSplitOperand(BInst.Inst, i, StartIdx, Width, InsertBefore, DL)); + if (GenXIntrinsic::isOverloadedArg((GenXIntrinsic::ID)IntrinID, i)) + OverloadedTypes.push_back(Args[i]->getType()); + } + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = + GenXIntrinsic::getAnyDeclaration(M, IntrinID, OverloadedTypes); + Instruction *NewInst = CallInst::Create( + Decl, Args, CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * getSplitOperand : get a possibly split operand + * + * Enter: Inst = original non-split instruction + * OperandNum = operand number we want + * StartIdx = element start index for this split + * Size = number of elements in this split + * InsertBefore = where to insert any added rdregion + * DL = debug location to give new instruction(s) + * + * If the requested operand is a constant, it splits the constant. + * Otherwise it creates an rdregion from the original operand. + */ +Value *GenXLegalization::getSplitOperand(Instruction *Inst, unsigned OperandNum, + unsigned StartIdx, unsigned Size, + Instruction *InsertBefore, + const DebugLoc &DL) { + Value *V = Inst->getOperand(OperandNum); + if (!isa(V->getType())) + return V; // operand not vector, e.g. variable index in region + if (auto C = dyn_cast(V)) + return getConstantSubvector(C, StartIdx, Size); + // Split a non-constant vector. + if (Instruction *OperandInst = dyn_cast(V)) { + auto i = SplitMap.find(OperandInst); + if (i != SplitMap.end()) { + // Operand is another instruction in the bale being split. + return i->second; + } + } + // Non-constant operand not baled in. + // Create an rdregion for the operand. + if (!V->getType()->getScalarType()->isIntegerTy(1)) { + Region R(V); + R.getSubregion(StartIdx, Size); + return R.createRdRegion(V, V->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + } + // Predicate version. 
+  return Region::createRdPredRegion(V, StartIdx, Size,
+                                    V->getName() + ".split" + Twine(StartIdx),
+                                    InsertBefore, DL);
+}
+
+/***********************************************************************
+ * convertToMultiIndirect : convert a rdregion into multi-indirect
+ *
+ * Enter:   Inst = original rdregion
+ *          LastJoinVal = the actual value to read from
+ *          R = region for it, already subregioned if applicable
+ *
+ * Return:  new rdregion instruction (old one has not been erased)
+ */
+Instruction *
+GenXLegalization::convertToMultiIndirect(Instruction *Inst, Value *LastJoinVal,
+                                         Region *R, Instruction *InsertBefore) {
+  assert(!R->is2D() && (R->NumElements == 4 || R->NumElements == 8));
+  Value *Indirect = R->Indirect;
+  assert(Indirect);
+  const DebugLoc &DL = Inst->getDebugLoc();
+
+  // scalar indirect index
+  if (R->Stride == 1 && !R->is2D() && !isa<VectorType>(Indirect->getType()) &&
+      ST->hasIndirectGRFCrossing()) {
+    Instruction *NewInst =
+        R->createRdRegion(LastJoinVal, Inst->getName(), InsertBefore, DL);
+    return NewInst;
+  }
+
+  // 1. Splat the address. (We will get multiple copies of this
+  // instruction, one per split, but they will be CSEd away.)
+  Instruction *SplattedIndirect = CastInst::Create(
+      Instruction::BitCast, Indirect, VectorType::get(Indirect->getType(), 1),
+      Twine(Indirect->getName()) + ".splat", InsertBefore);
+  SplattedIndirect->setDebugLoc(DL);
+  Region AddrR(SplattedIndirect);
+  AddrR.Stride = 0;
+  AddrR.Width = AddrR.NumElements = R->NumElements;
+  SplattedIndirect = AddrR.createRdRegion(
+      SplattedIndirect, SplattedIndirect->getName(), InsertBefore, DL);
+  // 2. Add the constant vector <0,1,2,3,4,5,6,7> to it (adjusted
+  // for stride in bytes).
+  uint16_t OffsetValues[8];
+  for (unsigned i = 0; i != 8; ++i)
+    OffsetValues[i] = i * (R->Stride * R->ElementBytes);
+  Constant *Offsets = ConstantDataVector::get(
+      InsertBefore->getContext(),
+      ArrayRef<uint16_t>(OffsetValues).slice(0, R->NumElements));
+  SplattedIndirect =
+      BinaryOperator::Create(Instruction::Add, SplattedIndirect, Offsets,
+                             SplattedIndirect->getName(), InsertBefore);
+  SplattedIndirect->setDebugLoc(DL);
+  // 3. Create the multi indirect subregion.
+  R->Indirect = SplattedIndirect;
+  R->VStride = R->Stride;
+  R->Stride = 1;
+  R->Width = 1;
+  Instruction *NewInst =
+      R->createRdRegion(LastJoinVal, Inst->getName(), InsertBefore, DL);
+  return NewInst;
+}
+
+/***********************************************************************
+ * transformByteMove : transform a byte move into short or int move
+ *
+ * Enter:   B = bale (not necessarily a byte move)
+ *
+ * Return:  0 if nothing changed, else the new head of bale (ignoring the
+ *          bitcasts inserted either side)
+ *
+ * If the bale is a byte move (a lone wrregion or lone rdregion or
+ * rdregion+wrregion where the element type is byte), and the region parameters
+ * are suitably aligned, we turn it into a short or int move. This saves the
+ * jitter having to split the byte move into an even half and an odd half.
+ *
+ * If the code is modified, it updates bale info.
+ *
+ * This optimization needs to be done when baling info is available, so
+ * legalization is a handy place to put it.
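+ *
+ * Illustrative example (not from the original comment): a bale that moves
+ * <16 x i8> data where the element counts, widths, vstrides and offsets of
+ * both regions are all multiples of 4 is rewritten as a <4 x i32> move
+ * between bitcasts; if they are only even, it becomes an <8 x i16> move.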
+ */ +Instruction *GenXLegalization::transformByteMove(Bale *B) { + auto HeadBI = B->getHead(); + Instruction *Head = HeadBI->Inst; + if (!Head->getType()->getScalarType()->isIntegerTy(8)) + return nullptr; + Instruction *Wr = nullptr, *Rd = nullptr; + if (HeadBI->Info.Type == BaleInfo::WRREGION) { + Wr = Head; + if (HeadBI->Info.isOperandBaled( + GenXIntrinsic::GenXRegion::NewValueOperandNum)) { + Rd = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!GenXIntrinsic::isRdRegion(Rd)) + return nullptr; + } + } else { + if (HeadBI->Info.Type != BaleInfo::RDREGION) + return nullptr; + Rd = Head; + } + // Now Rd is the rdregion and Wr is the wrregion, and one of them might be 0. + if (Rd && !isa(Rd->getType())) + return nullptr; + if (Wr && !isa(Wr->getOperand(1)->getType())) + return nullptr; + assert(Rd || Wr); + Value *In = Rd ? Rd->getOperand(0) : Wr->getOperand(1); + Region WrR; + if (Wr) { + WrR = Region(Wr, BaleInfo()); + if (WrR.Stride != 1 || WrR.Indirect || WrR.Mask) + return nullptr; + } else + WrR = Region(Rd); // representing just the result of the rd, not the region + Region RdR; + if (Rd) { + RdR = Region(Rd, BaleInfo()); + if (RdR.Stride != 1 || RdR.Indirect) + return nullptr; + } else + RdR = Region(Wr->getOperand(0)); // representing just the value being + // written in to the region + unsigned InNumElements = In->getType()->getVectorNumElements(); + assert(Wr || Rd); + unsigned OutNumElements = (Wr ? Wr : Rd)->getType()->getVectorNumElements(); + unsigned Misalignment = InNumElements | OutNumElements | RdR.NumElements | + RdR.Width | RdR.VStride | RdR.Offset | + WrR.NumElements | WrR.Width | WrR.VStride | + WrR.Offset; + if (Misalignment & 1) + return nullptr; + unsigned LogAlignment = Misalignment & 2 ? 1 : 2; + auto InTy = + VectorType::get(Type::getIntNTy(Head->getContext(), 8 << LogAlignment), + InNumElements >> LogAlignment); + // Create the bitcast of the input if necessary. (We do that even if the input + // is constant, on the basis that EarlyCSE will simplify it.) + Value *BCIn = nullptr; + if (BitCastInst *InCast = dyn_cast(In)) { + if (InCast->getSrcTy() == InTy) + BCIn = InCast->getOperand(0); + } + if (BCIn == nullptr) { + BCIn = CastInst::Create(Instruction::BitCast, In, InTy, "bytemov", Head); + cast(BCIn)->setDebugLoc(Head->getDebugLoc()); + } + Value *Val = BCIn; + if (Rd) { + // Create the new rdregion. + RdR.NumElements >>= LogAlignment; + RdR.VStride >>= LogAlignment; + RdR.Width >>= LogAlignment; + auto NewRd = RdR.createRdRegion(Val, "", Head, Rd->getDebugLoc(), + /*AllowScalar=*/false); + NewRd->takeName(Rd); + Baling->setBaleInfo(NewRd, BaleInfo(BaleInfo::RDREGION)); + Val = NewRd; + } + if (Wr) { + // Create the bitcast of the old value of the vector. (Or just reuse + // the first bitcast if it is of the same value -- I saw this in + // Boxfilter.) + Value *BCOld = BCIn; + if (In != Wr->getOperand(0)) { + Value *OV = Wr->getOperand(0); + BCOld = nullptr; + auto ResTy = VectorType::get( + Type::getIntNTy(Head->getContext(), 8 << LogAlignment), + OutNumElements >> LogAlignment); + if (BitCastInst *OVCast = dyn_cast(OV)) { + if (OVCast->getSrcTy() == ResTy) + BCOld = OVCast->getOperand(0); + } + if (BCOld == nullptr) { + BCOld = + CastInst::Create(Instruction::BitCast, OV, ResTy, "bytemov", Head); + cast(BCOld)->setDebugLoc(Wr->getDebugLoc()); + } + } + // Create the new wrregion. 
+ WrR.NumElements >>= LogAlignment; + WrR.VStride >>= LogAlignment; + WrR.Width >>= LogAlignment; + auto NewWr = cast( + WrR.createWrRegion(BCOld, Val, "", Head, Wr->getDebugLoc())); + NewWr->takeName(Wr); + BaleInfo BI(BaleInfo::WRREGION); + if (Rd) + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(NewWr, BI); + Val = NewWr; + } + + bool NeedBC = true; + if (Head->hasOneUse()) { + auto U = Head->use_begin()->getUser(); + if (BitCastInst *UBC = dyn_cast(U)) { + if (UBC->getDestTy() == Val->getType()) { + UBC->replaceAllUsesWith(Val); + eraseInst(UBC); + NeedBC = false; + } + } + } + if (NeedBC) { + // Create the bitcast back to the original type. + auto BCOut = CastInst::Create(Instruction::BitCast, Val, Head->getType(), + "bytemov", Head); + BCOut->setDebugLoc(Head->getDebugLoc()); + // Replace and erase the original rdregion and wrregion. We do not need + // to do anything with their baling info as that is a ValueMap and they get + // removed automatically. + Head->replaceAllUsesWith(BCOut); + } + if (Wr) + eraseInst(Wr); + if (Rd) + eraseInst(Rd); + // Return the new wrregion if any, else the new rdregion. Do not return + // BCOut as it is not part of the bale for the move. + assert(dyn_cast(Val)); + return cast(Val); +} + +/*********************************************************************** + * splatPredicateIfNecessary : splat a wrregion/select predicate if necessary + * + * Enter: V = the predicate + * Width = width it needs to be splatted to + * InsertBefore = where to insert new instructions + * DL = debug loc for new instructions + * + * Return: the predicate, possibly a new instruction + * + * From GenXLegalization onwards, the predicate (mask) in a wrregion must + * either be scalar constant 1, or have the same vector width as the value + * being written by the wrregion. Similarly for the selector in a vector + * select, except that is not allowed to be scalar constant 1. + * + * It might make more sense to do this in GenXLowering, except that the + * predicate might be wider than 32 at that point. So we have to do it here. + */ +Value *GenXLegalization::splatPredicateIfNecessary(Value *V, + Type *ValueToWriteTy, + Instruction *InsertBefore, + const DebugLoc &DL) { + if (auto VT = dyn_cast(ValueToWriteTy)) + return splatPredicateIfNecessary(V, VT->getNumElements(), InsertBefore, DL); + return V; +} + +Value *GenXLegalization::splatPredicateIfNecessary(Value *V, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL) { + if (Width == 1) + return V; + if (auto C = dyn_cast(V)) + if (C->isAllOnesValue()) + return V; + if (isa(V->getType())) + return V; + // Round Width up to 16 or 32. (No point in using up a 32 bit predicate + // register if we only need 16.) + unsigned RoundedWidth = Width > 16 ? 32 : 16; + // Use a select to turn the predicate into 0 or -1. + auto ITy = Type::getIntNTy(InsertBefore->getContext(), RoundedWidth); + auto Sel = SelectInst::Create( + V, Constant::getAllOnesValue(ITy), Constant::getNullValue(ITy), + InsertBefore->getName() + ".splatpredicate", InsertBefore); + Sel->setDebugLoc(DL); + // Bitcast that to v16i1 or v32i1 predicate (which becomes a setp + // instruction). + Instruction *Res = CastInst::Create( + Instruction::BitCast, Sel, + VectorType::get(Type::getInt1Ty(InsertBefore->getContext()), + RoundedWidth), + InsertBefore->getName() + ".splatpredicate", InsertBefore); + Res->setDebugLoc(DL); + // If the required size is smaller, do an rdpredregion. 
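+  // (Illustrative note: for Width == 8 the select and bitcast above produce
+  // a v16i1 value, and the rdpredregion below then extracts elements 0..7.)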
+ if (Width == RoundedWidth) + return Res; + return Region::createRdPredRegionOrConst( + Res, 0, Width, Res->getName() + ".rdpredregion", InsertBefore, DL); +} + +/*********************************************************************** + * eraseInst : erase instruction, updating CurrentInst if we're erasing that + */ +void GenXLegalization::eraseInst(Instruction *Inst) { + removingInst(Inst); + // If the result is a predicate, ensure it is removed from IllegalPredicates, + // just in case it is a wrpredregion that was in IllegalPredicates. + if (auto VT = dyn_cast(Inst->getType())) + if (VT->getElementType()->isIntegerTy(1)) + IllegalPredicates.remove(Inst); + Inst->eraseFromParent(); +} + +void GenXLegalization::removingInst(Instruction *Inst) { + if (Inst == CurrentInst) + CurrentInst = Inst->getNextNode(); +} + +/*********************************************************************** + * fixIllegalPredicates : fix illegally sized predicate values + */ +struct StackEntry { + Instruction *Wr; // the wrpredregion this stack entry is for + Instruction *Parent; // its parent wrpredregion in the tree + SmallVector Parts; + // Constructor given wrpredregion and parent. + StackEntry(Instruction *Wr, Instruction *Parent) : Wr(Wr), Parent(Parent) {} +}; + +void GenXLegalization::fixIllegalPredicates(Function *F) { + // First fix illegal size predicate phi nodes, replacing each with multiple + // phi nodes with rdpredregion on the incomings and wrpredregion on the + // result. These rdpredregions and wrpredregions then get removed with other + // illegal size predicates in the code below. + SmallVector PhisToErase; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto BB = &*fi; + Instruction *FirstNonPhi = BB->getFirstNonPHI(); + for (auto Phi = dyn_cast(BB->begin()); Phi; + Phi = dyn_cast(Phi->getNextNode())) { + if (!Phi->getType()->getScalarType()->isIntegerTy(1)) + continue; + // We have a predicate phi. Get the first part of it, which might show + // that we do not need to split it at all. + auto VT = dyn_cast(Phi->getType()); + if (!VT) + continue; + unsigned WholeSize = VT->getNumElements(); + auto PP = getPredPart(Phi, 0); + if (PP.Size == WholeSize) + continue; + // We do need to split. + Value *Joined = UndefValue::get(Phi->getType()); + unsigned NumIncoming = Phi->getNumIncomingValues(); + for (unsigned StartIdx = 0; StartIdx != WholeSize;) { + // Create a split phi node. + PP = getPredPart(Phi, StartIdx); + auto NewPhi = PHINode::Create( + VectorType::get(Phi->getType()->getScalarType(), PP.Size), + NumIncoming, Phi->getName() + ".split" + Twine(StartIdx), Phi); + // Do a rdpredregion for each incoming. + for (unsigned ii = 0; ii != NumIncoming; ++ii) { + BasicBlock *IncomingBlock = Phi->getIncomingBlock(ii); + Value *Incoming = Phi->getIncomingValue(ii); + auto NewRd = Region::createRdPredRegionOrConst( + Incoming, StartIdx, PP.Size, + Incoming->getName() + ".split" + Twine(StartIdx), + IncomingBlock->getTerminator(), DebugLoc()); + NewPhi->addIncoming(NewRd, IncomingBlock); + } + // Join with previous new phis for this original phi. + Joined = Region::createWrPredRegion(Joined, NewPhi, StartIdx, + Phi->getName() + ".join" + + Twine(StartIdx), + FirstNonPhi, DebugLoc()); + // If that was the first join, add it to the IllegalPredicates list for + // processing its tree of wrpredregions below. + if (!StartIdx) + IllegalPredicates.insert(cast(Joined)); + StartIdx += PP.Size; + } + // Replace the original phi and mark it for erasing. 
Also undef out its + // incomings so it doesn't matter what order we do the erases in. + auto Undef = UndefValue::get(Phi->getType()); + for (unsigned ii = 0; ii != NumIncoming; ++ii) + Phi->setIncomingValue(ii, Undef); + Phi->replaceAllUsesWith(Joined); + PhisToErase.push_back(Phi); + } + } + for (auto i = PhisToErase.begin(), e = PhisToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); + // For each entry in IllegalPredicates that is the root of a tree of + // wrpredregions... + SmallVector ToErase; + for (auto ipi = IllegalPredicates.begin(), ipe = IllegalPredicates.end(); + ipi != ipe; ++ipi) { + std::vector Stack; + auto Root = *ipi; + if (GenXIntrinsic::getGenXIntrinsicID(Root->getOperand(0)) == + GenXIntrinsic::genx_wrpredregion) + continue; // not root of tree + assert(isa(Root->getOperand(0)) && + "expecting undef input to root of tree"); + // See if it really is illegally sized. + if (getPredPart(Root, 0).Size == Root->getType()->getVectorNumElements()) + continue; + // For traversing the tree, create a stack where each entry represents a + // value in the tree, and contains the values of the parts. Create an + // initial entry for the root of the tree. + Stack.push_back(StackEntry(Root, nullptr)); + // Process stack entries. + while (!Stack.empty()) { + auto Entry = &Stack.back(); + if (!Entry->Parts.empty()) { + // This stack entry has already been processed; we are on the way back + // down having processed its children. Just pop the stack entry, and + // mark the wrpredregion for erasing. We do not erase it now because it + // might be yet to visit in the IllegalPredicates vector. + ToErase.push_back(Entry->Wr); + Stack.pop_back(); + continue; + } + // Populate Parts with the value of each part from the parent. + if (!Entry->Parent) { + // No parent. All parts are undef. + auto Ty = Entry->Wr->getType(); + unsigned WholeSize = Ty->getVectorNumElements(); + for (unsigned Offset = 0; Offset != WholeSize;) { + auto PP = getPredPart(Entry->Wr, Offset); + Entry->Parts.push_back( + UndefValue::get(VectorType::get(Ty->getScalarType(), PP.Size))); + Offset += PP.Size; + } + } else { + // Inherit from parent. + for (auto i = (Entry - 1)->Parts.begin(), e = (Entry - 1)->Parts.end(); + i != e; ++i) + Entry->Parts.push_back(*i); + } + // For this wrpredregion, determine the part that it writes to, and see + // if it is the whole part. (It cannot overlap more than one part, + // because getLegalPredSize ensured that all splits were within parts.) + unsigned WrOffset = + cast(Entry->Wr->getOperand(2))->getZExtValue(); + unsigned WrSize = + Entry->Wr->getOperand(1)->getType()->getVectorNumElements(); + auto PP = getPredPart(Entry->Wr, WrOffset); + assert(WrOffset + WrSize <= PP.Offset + PP.Size && + "overlaps multiple parts"); + Value *Part = Entry->Parts[PP.PartNum]; + if (WrSize != PP.Size) { + // Not the whole part. We need to write into the previous value of this + // part. + auto NewWr = Region::createWrPredRegion( + Part, Entry->Wr->getOperand(1), WrOffset - PP.Offset, "", Entry->Wr, + Entry->Wr->getDebugLoc()); + NewWr->takeName(Entry->Wr); + Part = NewWr; + } else + Part = Entry->Wr->getOperand(1); + // Store the new value of this part. + Entry->Parts[PP.PartNum] = Part; + // Gather uses in rdpredregion. 
+ SmallVector Rds; + for (auto ui = Entry->Wr->use_begin(), ue = Entry->Wr->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (GenXIntrinsic::getGenXIntrinsicID(User) == + GenXIntrinsic::genx_rdpredregion) + Rds.push_back(User); + } + // For each rdpredregion, turn it into a read from the appropriate + // part. + for (auto ri = Rds.begin(), re = Rds.end(); ri != re; ++ri) { + Instruction *Rd = *ri; + unsigned RdOffset = + cast(Rd->getOperand(1))->getZExtValue(); + unsigned RdSize = Rd->getType()->getVectorNumElements(); + auto PP = getPredPart(Entry->Wr, RdOffset); + assert(RdOffset + RdSize <= PP.Offset + PP.Size && + "overlaps multiple parts"); + Value *Part = Entry->Parts[PP.PartNum]; + if (RdSize != PP.Size) { + // Only reading a subregion of a part. + // Assert that the rdpredregion is legal. In fact we will probably + // have to cope with an illegal one, by generating code to bitcast + // the predicate to a scalar int (or finding code where it is already + // bitcast from a scalar int), using bit twiddling to get the + // required subregion, and bitcasting back. I think this situation + // will arise where the input to legalization had an odd size + // rdpredregion in a wrregion where the input predicate is a v32i1 + // from an odd size CM select using an i32 as the mask. +#if _DEBUG + if (RdOffset) { + unsigned RdMisalignment = 1U << findFirstSet(RdOffset); + assert((RdMisalignment >= 8 || + (RdMisalignment == 4 && Rd->hasOneUse() && + cast(Rd->use_begin()->getUser()) + ->getOperand(1) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits() == 64)) && + !((RdOffset - PP.Offset) % RdSize) && + "illegal rdpredregion"); + } +#endif + // Create a new rdpredregion. + auto NewRd = Region::createRdPredRegion( + Part, RdOffset - PP.Offset, RdSize, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Part = NewRd; + } + // Replace the original rdpredregion with the value of the part. + Rd->replaceAllUsesWith(Part); + Rd->eraseFromParent(); + } + // All remaining uses must be wrpredregion. Push them onto the stack. + for (auto ui = Entry->Wr->use_begin(), ue = Entry->Wr->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + assert(GenXIntrinsic::getGenXIntrinsicID(User) == + GenXIntrinsic::genx_wrpredregion && + !ui->getOperandNo() && "expecting only wrpredregion uses"); + Stack.push_back(StackEntry(User, Entry->Wr)); + } + } + } + // Erase the old wrpredregions. + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); +} + +GenXLegalization::SplitKind GenXLegalization::checkBaleSplittingKind() { + if (B.endsWithGStore()) + return SplitKind::SplitKind_GStore; + + auto Head = B.getHeadIgnoreGStore(); + SplitKind Kind = SplitKind::SplitKind_Normal; + + if (Head->Info.Type == BaleInfo::WRREGION) { + Value *WrRegionInput = Head->Inst->getOperand(0); + Region R1(Head->Inst, Head->Info); + for (auto &I : B) { + if (I.Info.Type != BaleInfo::RDREGION) + continue; + if (I.Inst->getOperand(0) != WrRegionInput) + continue; + Region R2(I.Inst, I.Info); + if (R1 != R2) { + // Check if R1 overlaps with R2. Create a new region for R1 as we are + // rewriting region offsets if their difference is a constant. + Region R(Head->Inst, Head->Info); + + // Analyze dynamic offset difference, but only for a scalar offset. 
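+        // (Illustrative example: if the wrregion is indexed by %addr and the
+        // rdregion by %addr + 16, the SCEV difference below folds to the
+        // constant 16, so both regions can be compared with constant
+        // offsets.)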
+        if (R1.Indirect && R2.Indirect) {
+          if (R1.Indirect->getType()->isVectorTy() ||
+              R2.Indirect->getType()->isVectorTy())
+            return SplitKind::SplitKind_Normal;
+
+          // Strip truncation from bitcast followed by a region read.
+          auto stripConv = [](Value *Val) {
+            if (GenXIntrinsic::isRdRegion(Val)) {
+              CallInst *CI = cast<CallInst>(Val);
+              Region R(CI, BaleInfo());
+              if (R.Offset == 0 && R.Width == 1)
+                Val = CI->getOperand(0);
+              if (auto BI = dyn_cast<BitCastInst>(Val))
+                Val = BI->getOperand(0);
+            }
+            return Val;
+          };
+
+          Value *Offset1 = stripConv(R.Indirect);
+          Value *Offset2 = stripConv(R2.Indirect);
+          if (Offset1->getType() == Offset2->getType()) {
+            auto S1 = SE->getSCEV(Offset1);
+            auto S2 = SE->getSCEV(Offset2);
+            auto Diff = SE->getMinusSCEV(S1, S2);
+            assert(R.Indirect);
+            Diff = SE->getTruncateOrNoop(Diff, R.Indirect->getType());
+            if (auto SCC = dyn_cast<SCEVConstant>(Diff)) {
+              ConstantInt *CI = SCC->getValue();
+              int OffsetDiff = std::abs(static_cast<int>(CI->getSExtValue()));
+              R.Offset = 0;
+              R.Indirect = nullptr;
+              R2.Offset = OffsetDiff;
+              R2.Indirect = nullptr;
+            }
+          }
+        }
+
+        // Ignore the mask and adjust both offsets by a common dynamic
+        // value if one exists. If the resulting regions do not overlap, then
+        // the two original regions do not overlap.
+        R.Mask = nullptr;
+        R2.Mask = nullptr;
+
+        // As both R and R2 have constant offsets, the overlap function
+        // should check their footprints accurately.
+        if (R.overlap(R2))
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+        continue;
+      }
+
+      // (1) 1D direct regions or indirect regions with single offset.
+      // (2) 2D direct regions with VStride >= Width, or indirect regions with
+      //     single offset.
+      bool IsMultiAddr = R1.Indirect && R1.Indirect->getType()->isVectorTy();
+      if (!R1.is2D()) {
+        if (IsMultiAddr)
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+      } else {
+        if (R1.VStride < (int)R1.Width || IsMultiAddr)
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+      }
+    }
+  }
+
+  return Kind;
+}
+
+// This function deals with intrinsic calls with special restrictions.
+// - Certain intrinsic calls should be placed in the entry block:
+//     llvm.genx.predefined.surface
+//
+void GenXLegalization::fixIntrinsicCalls(Function *F) {
+  auto PF = F->getParent()->getFunction("llvm.genx.predefined.surface");
+  if (!PF)
+    return;
+
+  // Collect all calls to PF in this function.
+  std::map<int64_t, std::vector<Instruction *>> Calls;
+  for (auto U : PF->users()) {
+    if (auto UI = dyn_cast<Instruction>(U)) {
+      BasicBlock *BB = UI->getParent();
+      if (BB->getParent() != F)
+        continue;
+      if (auto CI = dyn_cast<ConstantInt>(UI->getOperand(0))) {
+        int64_t Arg = CI->getSExtValue();
+        Calls[Arg].push_back(UI);
+      }
+    }
+  }
+
+  BasicBlock *EntryBB = &F->getEntryBlock();
+  Instruction *InsertPos = &*EntryBB->getFirstInsertionPt();
+
+  for (auto I = Calls.begin(), E = Calls.end(); I != E; ++I) {
+    Instruction *EntryDef = nullptr;
+    for (auto Inst : I->second) {
+      if (Inst->getParent() == EntryBB) {
+        EntryDef = Inst;
+        break;
+      }
+    }
+
+    // No entry definition found, so clone one.
+    if (EntryDef == nullptr) {
+      EntryDef = I->second.front()->clone();
+      EntryDef->insertBefore(InsertPos);
+    } else
+      EntryDef->moveBefore(InsertPos);
+
+    // Now replace all uses with this new definition.
+ for (auto Inst : I->second) { + std::vector WorkList{Inst}; + while (!WorkList.empty()) { + Instruction *CurI = WorkList.back(); + WorkList.pop_back(); + + for (auto UI = CurI->use_begin(); UI != CurI->use_end();) { + Use &U = *UI++; + // Skip if this use just comes from EntryDef. + if (EntryDef == U.get()) + continue; + // All uses of this PHI will be replaced as well. + if (auto PHI = dyn_cast(U.getUser())) + WorkList.push_back(PHI); + U.set(EntryDef); + } + if (CurI->use_empty()) + CurI->eraseFromParent(); + } + } + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp new file mode 100644 index 000000000000..0c772f3ba69e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp @@ -0,0 +1,215 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLiveRanges +/// -------------- +/// +/// GenXLiveRanges calculates the actual live range information (the segments) +/// on the LiveRange object for each value. See the comment at the top of +/// GenXLiveness.h for details of how the live range information works. This +/// pass calls GenXLiveness::buildLiveRange to do the work for each value. +/// +/// The LiveRange object for each value already existed before this pass, as it +/// was created by GenXCategory. In the case of a value that we can now see does +/// not want a LiveRange, because it is an Instruction baled in to something, +/// we erase the LiveRange here. 
+///
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "GENX_LIVERANGES"
+
+#include "FunctionGroup.h"
+#include "GenX.h"
+#include "GenXBaling.h"
+#include "GenXIntrinsics.h"
+#include "GenXLiveness.h"
+#include "GenXNumbering.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace genx;
+
+namespace {
+
+class GenXLiveRanges : public FunctionGroupPass {
+  FunctionGroup *FG;
+  GenXBaling *Baling;
+  GenXLiveness *Liveness;
+public:
+  static char ID;
+  explicit GenXLiveRanges() : FunctionGroupPass(ID) { }
+  virtual StringRef getPassName() const { return "GenX live ranges analysis"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnFunctionGroup(FunctionGroup &FG);
+  // createPrinterPass : get a pass to print the IR, together with the GenX
+  // specific analyses
+  virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const
+  { return createGenXGroupPrinterPass(O, Banner); }
+
+private:
+  void buildLiveRanges();
+
+  bool isPredefinedVariable(Value *) const;
+};
+
+} // end anonymous namespace
+
+namespace llvm { void initializeGenXLiveRangesPass(PassRegistry &); }
+char GenXLiveRanges::ID = 0;
+INITIALIZE_PASS_BEGIN(GenXLiveRanges, "GenXLiveRanges", "GenXLiveRanges", false, false)
+INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling)
+INITIALIZE_PASS_DEPENDENCY(GenXLiveness)
+INITIALIZE_PASS_DEPENDENCY(GenXNumbering)
+INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis)
+INITIALIZE_PASS_END(GenXLiveRanges, "GenXLiveRanges", "GenXLiveRanges", false, false)
+
+FunctionGroupPass *llvm::createGenXLiveRangesPass()
+{
+  initializeGenXLiveRangesPass(*PassRegistry::getPassRegistry());
+  return new GenXLiveRanges();
+}
+
+void GenXLiveRanges::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  FunctionGroupPass::getAnalysisUsage(AU);
+  AU.addRequired<GenXGroupBaling>();
+  AU.addRequired<GenXLiveness>();
+  AU.addRequired<GenXNumbering>();
+  AU.addRequired<FunctionGroupAnalysis>();
+  AU.setPreservesAll();
+}
+
+/***********************************************************************
+ * runOnFunctionGroup : run the liveness analysis for this FunctionGroup
+ */
+bool GenXLiveRanges::runOnFunctionGroup(FunctionGroup &ArgFG)
+{
+  FG = &ArgFG;
+  Baling = &getAnalysis<GenXGroupBaling>();
+  Liveness = &getAnalysis<GenXLiveness>();
+  Liveness->setBaling(Baling);
+  Liveness->setNumbering(&getAnalysis<GenXNumbering>());
+  // Build the live ranges.
+  Liveness->buildSubroutineLRs();
+  buildLiveRanges();
+#ifndef NDEBUG
+  // Check we don't have any leftover empty live ranges. If we do, it means
+  // that a pass between GenXCategory and here has erased a value and failed
+  // to erase its LiveRange, or alternatively this pass has failed to erase
+  // the LiveRange for a value that does not need it because it is a baled
+  // in instruction.
+  for (GenXLiveness::iterator i = Liveness->begin(), e = Liveness->end(); i != e; ++i) {
+    LiveRange *LR = i->second;
+    assert(LR->size()); // Check the LR has at least one segment.
+  }
+#endif // ndef NDEBUG
+  return false;
+}
+
+/***********************************************************************
+ * isPredefinedVariable : check if it's translated into predefined
+ * variables in vISA.
+ */ +bool GenXLiveRanges::isPredefinedVariable(Value *V) const { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_predefined_surface: + return true; + default: + break; + } + return false; +} + +/*********************************************************************** + * buildLiveRanges : build live ranges for all args and instructions + */ +void GenXLiveRanges::buildLiveRanges() +{ + // Build live ranges for global variables; + for (auto &G : FG->getModule()->globals()) + Liveness->buildLiveRange(&G); + for (auto i = FG->begin(), e = FG->end(); i != e; ++i) { + Function *Func = *i; + // Build live ranges for args. + for (auto fi = Func->arg_begin(), fe = Func->arg_end(); fi != fe; ++fi) + Liveness->buildLiveRange(&*fi); + if (i != FG->begin() && !Func->getReturnType()->isVoidTy()) { + // Build live range(s) for unified return value. + Liveness->buildLiveRange(Liveness->getUnifiedRet(Func)); + } + // Build live ranges for code. + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + // Skip building live range for instructions + // - has no destination + // - is already baled, or + // - is predefined variable in vISA. + if (!Inst->getType()->isVoidTy() && !Baling->isBaled(Inst) && + !isPredefinedVariable(Inst)) { + // Instruction is not baled in to anything. + // First check if the result is unused and it is an intrinsic whose + // result is marked RAW_NULLALLOWED. If so, don't create a live range, + // so no register gets allocated. + if (Inst->use_empty()) { + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::not_any_intrinsic: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + break; + default: { + GenXIntrinsicInfo::ArgInfo AI + = GenXIntrinsicInfo(IID).getRetInfo(); + if (AI.isRaw() && AI.rawNullAllowed()) { + // Unused RAW_NULLALLOWED result. + Liveness->eraseLiveRange(Inst); + continue; + } + break; + } + } + } + // Build its live range. + Liveness->buildLiveRange(Inst); + } else { + // Instruction is baled in to something. Erase its live range so the + // register allocator does not try and allocate it something. + Liveness->eraseLiveRange(Inst); + } + } + } + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp new file mode 100644 index 000000000000..ea4d871f2038 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp @@ -0,0 +1,1872 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXLiveness is an analysis that contains the liveness information for the +// values in the code. See the comment at the top of GenXLiveness.h for further +// details. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LIVENESS" + +#include "GenXLiveness.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXIntrinsics.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" + +#include + +using namespace llvm; +using namespace genx; + +char GenXLiveness::ID = 0; +INITIALIZE_PASS_BEGIN(GenXLiveness, "GenXLiveness", "GenXLiveness", false, false) +INITIALIZE_PASS_END(GenXLiveness, "GenXLiveness", "GenXLiveness", false, false) + +FunctionGroupPass *llvm::createGenXLivenessPass() +{ + initializeGenXLivenessPass(*PassRegistry::getPassRegistry()); + return new GenXLiveness(); +} + +void GenXLiveness::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : do nothing + */ +bool GenXLiveness::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + clear(); + FG = &ArgFG; + return false; +} + +/*********************************************************************** + * clear : clear the GenXLiveness + */ +void GenXLiveness::clear() +{ + while (!LiveRangeMap.empty()) { + LiveRange *LR = LiveRangeMap.begin()->second; + for (auto i = LR->value_begin(), e = LR->value_end(); i != e; ++i) { + SimpleValue V = *i; + LiveRangeMap.erase(V); + } + delete LR; + } + FG = 0; + delete CG; + CG = 0; + for (auto i = UnifiedRets.begin(), e = UnifiedRets.end(); i != e; ++i) + i->second->deleteValue(); + UnifiedRets.clear(); + UnifiedRetToFunc.clear(); + ArgAddressBaseMap.clear(); +} + +/*********************************************************************** + * setLiveRange : add a SimpleValue to a LiveRange + * + * This: + * 1. adds the SimpleValue to the LiveRange's value list; + * 2. sets the SimpleValue's entry in the map to point to the LiveRange. 
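+ * 3. sets the live range's alignment from the value's type
+ *    (via setAlignmentFromValue).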
+ */ +void GenXLiveness::setLiveRange(SimpleValue V, LiveRange *LR) +{ + assert(LiveRangeMap.find(V) == LiveRangeMap.end() && "Attempting to set LiveRange for Value that already has one"); + LR->addValue(V); + LiveRangeMap[V] = LR; + LR->setAlignmentFromValue(V); +} + +/*********************************************************************** + * setAlignmentFromValue : set a live range's alignment from a value + */ +void LiveRange::setAlignmentFromValue(SimpleValue V) +{ + Type *Ty = IndexFlattener::getElementType( + V.getValue()->getType(), V.getIndex()); + if (Ty->isPointerTy()) + Ty = Ty->getPointerElementType(); + unsigned SizeInBits = Ty->getScalarType()->getPrimitiveSizeInBits(); + if (auto VT = dyn_cast(Ty)) + SizeInBits *= VT->getNumElements(); + unsigned LogAlign = Log2_32(SizeInBits) - 3; + // Set max alignment to GRF + if (LogAlign > 5) + LogAlign = 5; + setLogAlignment(LogAlign); +} + +/*********************************************************************** + * rebuildCallGraph : rebuild GenXLiveness's call graph + */ +void GenXLiveness::rebuildCallGraph() +{ + delete CG; + CG = new CallGraph(FG); + CG->build(this); +} + +/*********************************************************************** + * buildSubroutineLRs : build the subroutine LRs + * + * If the FunctionGroup has subroutines, then each one (each Function other + * than the head one) gets a "subroutine LR", giving the live range + * of the whole subroutine plus any other subroutines it can call. + * Then, when building a real live range later, if it goes over a call, + * we can add the subroutine LR. + * + * The subroutine LR has weak liveness, as that's what we want to add to + * anything live over a call to the subroutine. + */ +void GenXLiveness::buildSubroutineLRs() +{ + if (FG->size() == 1) + return; // no subroutines + // Build a call graph for the FunctionGroup. It is acyclic because there is + // no recursion. + rebuildCallGraph(); + // Depth-first walk the graph to propagate live ranges upwards. + visitPropagateSLRs(FG->getHead()); +} + +/*********************************************************************** + * visitPropagateSLRs : visit a callgraph node to propagate subroutine LR + * + * This is recursive. + */ +LiveRange *GenXLiveness::visitPropagateSLRs(Function *F) +{ + LiveRange *LR = getOrCreateLiveRange(F); + // Add a segment for just this function. + LR->push_back(Segment(Numbering->getNumber(F), + Numbering->getNumber(F->back().getTerminator()) + 1, Segment::WEAK)); + // For each child... + CallGraph::Node *N = CG->getNode(F); + for (auto i = N->begin(), e = N->end(); i != e; ++i) { + // Visit the child to calculate its LR. + LiveRange *ChildLR = visitPropagateSLRs(i->Call->getCalledFunction()); + // Merge it into ours. + LR->addSegments(ChildLR); + } + LR->sortAndMerge(); + return LR; +} + +/*********************************************************************** + * buildLiveRange : build live range for one value (arg or non-baled inst) + * + * For a struct value, each element's live range is built separately, even + * though they are almost identical. They are not exactly identical, + * differing at the def if it is the return value of a call, and at a use + * that is a call arg. 
+ */ +void GenXLiveness::buildLiveRange(Value *V) +{ + auto ST = dyn_cast(V->getType()); + if (!ST) { + buildLiveRange(SimpleValue(V)); + return; + } + for (unsigned i = 0, e = IndexFlattener::getNumElements(ST); i != e; ++i) + buildLiveRange(SimpleValue(V, i)); +} + +/*********************************************************************** + * buildLiveRange : build live range for one SimpleValue + * + * rebuildLiveRange : rebuild live range for a LiveRange struct + * + * The BBs[] array, one entry per basic block, is temporarily used here to + * store the live range for the value within that block. We start by + * registering the short live range for the definition, then, for each use, + * create a live range in the use's block then recursively scan back + * through predecessors until we meet a block where there is already a + * live range. This is guaranteed to terminate because of the dominance + * property of SSA. + * + * See Appel "Modern Compiler Implementation in C" 19.6. + * + * rebuildLiveRange can be called from later passes to rebuild the segments + * for a particular live range. If used after coalescing, the live range might + * have more than one value, in which case segments are added for each value + * and then merged. Thus we assume that, after whatever code change a pass made + * to require rebuilding the live range, the coalesced values can still be + * validly coalesced, without having any way of checking that. + * + */ +LiveRange *GenXLiveness::buildLiveRange(SimpleValue V) +{ + LiveRange *LR = getOrCreateLiveRange(V); + rebuildLiveRange(LR); + return LR; +} + +void GenXLiveness::rebuildLiveRange(LiveRange *LR) +{ + LR->getOrDefaultCategory(); + LR->Segments.clear(); + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + rebuildLiveRangeForValue(LR, *vi); + LR->sortAndMerge(); +} + +void GenXLiveness::rebuildLiveRangeForValue(LiveRange *LR, SimpleValue SV) +{ + Value *V = SV.getValue(); + + // This value is a global variable. Its live range is the entire kernel. + if (auto GV = getUnderlyingGlobalVariable(V)) { + (void)GV; + LR->push_back(0, Numbering->getLastNumber()); + return; + } + + std::map BBRanges; + if (auto Func = isUnifiedRet(V)) { + // This value is the unified return value of the function Func. Its live + // range is from the call to where its post-copy would go just afterwards + // for each call site, also from the site of the pre-copy to the return + // instruction. + for (auto ui = Func->use_begin(), ue = Func->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) + LR->push_back(Numbering->getNumber(CI), + Numbering->getRetPostCopyNumber(CI, SV.getIndex())); + } + for (auto fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) + if (auto RI = dyn_cast(fi->getTerminator())) + LR->push_back(Numbering->getRetPreCopyNumber(RI, SV.getIndex()), + Numbering->getNumber(RI)); + return; + } + + // Mark the value as live and then almost immediately dead again at the + // point where it is defined. + unsigned StartNum = 0, EndNum = 0; + Function *Func = 0; + auto Arg = dyn_cast(V); + BasicBlock *BB = nullptr; + if (Arg) { + Func = Arg->getParent(); + StartNum = Numbering->getNumber(Func); + EndNum = StartNum + 1; + BB = &Func->front(); + } else if (auto Phi = dyn_cast(V)) { + // Phi node. Treat as defined at the start of the block. 
+ EndNum = Numbering->getNumber(Phi) + 1; + BB = Phi->getParent(); + StartNum = Numbering->getNumber(BB); + // For a phi node, we also need to register an extra little live range at + // the end of each predecessor, from where we will insert a copy to the + // end. This is done lower down in this function. + } else { + StartNum = Numbering->getNumber(V); + auto Inst = cast(V); + BB = Inst->getParent(); + auto CI = dyn_cast(V); + if (CI) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(V)) { + // For the return value from a call, move the definition point to the ret + // post-copy slot after the call, where the post-copy will be inserted if + // it fails to be coalesced with the function's unified return value. + StartNum = Numbering->getRetPostCopyNumber(CI, SV.getIndex()); + } + } + EndNum = StartNum + 1; + if (CI && getTwoAddressOperandNum(CI) >= 0) { + // Two address op. Move the definition point one earlier, to where + // GenXCoalescing will need to insert a copy if coalescing fails. + --StartNum; + } + } + BBRanges[BB] = Segment(StartNum, EndNum); + // The stack for predecessors that need to be processed: + std::vector Stack; + // Process each use. + for (Value::use_iterator i = V->use_begin(), e = V->use_end(); + i != e; ++i) { + BasicBlock *BB = nullptr; + Instruction *user = cast(i->getUser()); + unsigned Num; + if (PHINode *Phi = dyn_cast(user)) { + // Use in a phi node. We say that the use is where the phi copy will be + // placed in the predecessor block. + BB = Phi->getIncomingBlock(*i); + Num = Numbering->getPhiNumber(Phi, BB); + } else { + // Normal use. + // For live range purposes, an instruction is considered to be at the + // same place as the head of its bale. We need to use getBaleHead to + // ensure that we consider it to be there. + Instruction *UserHead = Baling->getBaleHead(user); + BB = UserHead->getParent(); + Num = Numbering->getNumber(UserHead); + if (auto CI = dyn_cast(user)) { + if (CI->isInlineAsm() || CI->isIndirectCall()) + Num = Numbering->getNumber(UserHead); + else { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::not_any_intrinsic: + // Use as a call arg. We say that the use is at the arg pre-copy + // slot, where the arg copy will be inserted in coalescing. This + // assumes that the copies will be in the same order as args in the + // call, with struct elements in order too. + Num = Numbering->getArgPreCopyNumber(CI, i->getOperandNo(), + SV.getIndex()); + break; + default: + if (getTwoAddressOperandNum(CI) == (int)i->getOperandNo()) { + // The use is the two address operand in a two address op. Move + // the use point one earlier, to where GenXCoalescing will need + // to insert a copy if coalescing fails. If there is any other + // use of this value in the same bale, that will not have its use + // point one number earlier. The unnecessary interference that + // would cause is fixed in the way that twoAddrInterfere() + // detects interference. + --Num; + } + break; + case GenXIntrinsic::genx_simdcf_goto: + // Use in a goto. Treat it as at the branch, as GenXVisaFuncWriter + // writes the goto just before the branch, after any intervening IR. + Num = Numbering->getNumber(CI->getParent()->getTerminator()); + break; + } + } + } else if (auto RI = dyn_cast(user)) { + // Use in a return. We say that the use is where the ret value + // pre-copy will be inserted in coalescing. This assumes that the + // copies will be in the same order as the struct elements in the + // return value. 
+ Num = Numbering->getRetPreCopyNumber(RI, SV.getIndex()); + } + } + auto BBRange = &BBRanges[BB]; + if (BBRange->getEnd()) { + // There is already a live range in this block. Extend it if + // necessary. No need to scan back from here, so we're done with + // this use. + if (BBRange->getEnd() < Num) + BBRange->setEnd(Num); + continue; + } + // Add a new live range from the start of this block, and remember the + // range of blocks that contain a live range (so we don't have to scan + // all of them at the end). + *BBRange = Segment(Numbering->getNumber(BB), Num); + // Push this block's predecessors onto the stack. + // (A basic block's predecessors are those blocks containing a + // TerminatorInst that uses the basic block.) + for (Value::use_iterator i = BB->use_begin(), e = BB->use_end(); + i != e; ++i) { + Instruction *TI = dyn_cast(i->getUser()); + assert(TI); + if (TI->isTerminator()) + Stack.push_back(TI->getParent()); + } + // Process stack until empty. + while (Stack.size()) { + BB = Stack.back(); + Stack.pop_back(); + BBRange = &BBRanges[BB]; + auto BBNum = Numbering->getBBNumber(BB); + if (BBRange->getEnd()) { + // There is already a live range in this block. Extend it to the end. + // No need to scan back from here. + BBRange->setEnd(BBNum->EndNumber); + continue; + } + // Add a new live range through the whole of this block, and remember the + // range of blocks that contain a live range (so we don't have to scan + // all of them at the end). + BBRange->setStartEnd(Numbering->getNumber(BB), BBNum->EndNumber); + // Push this block's predecessors onto the stack. + // (A basic block's predecessors are those blocks containing a + // TerminatorInst that uses the basic block.) + for (Value::use_iterator i = BB->use_begin(), e = BB->use_end(); + i != e; ++i) { + Instruction *TI = dyn_cast(i->getUser()); + assert(TI); + if (TI->isTerminator()) + Stack.push_back(TI->getParent()); + } + } + } + // Now we can build the live range. + for (auto bri = BBRanges.begin(), bre = BBRanges.end(); bri != bre; ++bri) { + auto BBRange = &bri->second; + LR->push_back(*BBRange); + } + if (PHINode *Phi = dyn_cast(V)) { + // For a phi node, we also need to register an extra little live range at + // the end of each predecessor, from where we will insert a copy to the + // end. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + auto Pred = Phi->getIncomingBlock(i); + auto BBNum = Numbering->getBBNumber(Pred); + LR->push_back(Segment(Numbering->getPhiNumber(Phi, Pred), + BBNum->EndNumber, Segment::PHICPY)); + } + } + LR->sortAndMerge(); + if (CG) { + // Check if the live range crosses any call instruction. If so, add the + // appropriate subroutine live range. + bool NeedSort = false; + auto N = CG->getNode(Func); + for (auto i = N->begin(), e = N->end(); i != e; ++i) { + auto E = &*i; + // See if this call is in a segment of the LR. + auto Seg = LR->find(E->Number); + if (Seg != LR->end() && Seg->getStart() <= E->Number && Seg->getEnd() > E->Number) { + // Yes it is. Merge the subroutine LR of the callee into our LR. + if (!E->Call->getCalledFunction()->hasFnAttribute("CMStackCall")) + LR->addSegments(getLiveRange(E->Call->getCalledFunction())); + NeedSort = true; + } + } + if (NeedSort) + LR->sortAndMerge(); + } + if (Arg) { + // For a function arg, for each call site, add a segment from the arg + // pre-copy site, the point just before the call at which it will be copied + // into, up to the call. 
We assume that any copies before the call + // inserted by coalescing will be in the obvious order of args and elements + // within args. + Function *F = Arg->getParent(); + if (*FG->begin() != F) { // is a subroutine + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) { + LR->push_back( + Numbering->getArgPreCopyNumber(CI, Arg->getArgNo(), SV.getIndex()), + Numbering->getNumber(CI)); + } + } + } + } +} + +void GenXLiveness::removeBale(Bale &B) { + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) + removeValue(bi->Inst); +} + +/*********************************************************************** + * removeValue : remove the supplied value from its live range, and delete + * the range if it now has no values + * + * removeValueNoDelete : same, but do not delete the LR if it is now + * valueless + * + * Calling this with a value that does not have a live range is silently + * ignored. + */ +void GenXLiveness::removeValue(Value *V) +{ + for (unsigned i = 0, e = IndexFlattener::getNumElements(V->getType()); i != e; ++i) + removeValue(SimpleValue(V, i)); +} + +void GenXLiveness::removeValue(SimpleValue V) +{ + LiveRange *LR = removeValueNoDelete(V); + if (LR && !LR->Values.size()) { + // V was the only value in LR. Remove LR completely. + delete LR; + } +} + +LiveRange *GenXLiveness::removeValueNoDelete(SimpleValue V) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.find(V); + if (i == LiveRangeMap.end()) + return nullptr; + LiveRange *LR = i->second; + LiveRangeMap.erase(i); + // Remove V from LR. + unsigned j; + for (j = 0; LR->Values[j].get() != V; ++j) { + assert(j != LR->Values.size()); + } + if (&LR->Values[j] != &LR->Values.back()) + LR->Values[j] = LR->Values.back(); + LR->Values.pop_back(); + return LR; +} + +/*********************************************************************** + * removeValuesNoDelete : remove all values from the live range, but do not + * delete the LR + */ +void GenXLiveness::removeValuesNoDelete(LiveRange *LR) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + LiveRangeMap.erase(*vi); + LR->value_clear(); +} + +/*********************************************************************** + * replaceValue : update liveness such that NewVal has OldVal's live range, + * and OldVal does not have one at all. + */ +void GenXLiveness::replaceValue(Value *OldVal, Value *NewVal) +{ + for (unsigned i = 0, e = IndexFlattener::getNumElements(OldVal->getType()); + i != e; ++i) + replaceValue(SimpleValue(OldVal, i), SimpleValue(NewVal, i)); +} + +void GenXLiveness::replaceValue(SimpleValue OldVal, SimpleValue NewVal) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.find(OldVal); + assert(i != LiveRangeMap.end()); + LiveRange *LR = i->second; + LiveRangeMap.erase(i); + LiveRangeMap[NewVal] = LR; + unsigned j = 0; + assert(!LR->Values.empty()); + for (j = 0; LR->Values[j].get() != OldVal; ++j) + assert(j != LR->Values.size()); + LR->Values[j] = NewVal; +} + +/*********************************************************************** + * getOrCreateLiveRange : get live range for a value, creating if necessary + */ +LiveRange *GenXLiveness::getOrCreateLiveRange(SimpleValue V) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.insert( + LiveRangeMap_t::value_type(V, 0)).first; + LiveRange *LR = i->second; + if (!LR) { + // Newly created map entry. Create the LiveRange for it. 
+ LR = new LiveRange; + LR->Values.push_back(V); + i->second = LR; + LR->setAlignmentFromValue(V); + } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + // Give the Value a name if it doesn't already have one. + if (!V.getValue()->getName().size()) { + std::string NameBuf; + StringRef Name = "arg"; + if (auto Inst = dyn_cast(V.getValue())) { + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(V.getValue()); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + // For an intrinsic call, use the intrinsic name after the + // final period. + NameBuf = GenXIntrinsic::getAnyName(IID, None); + Name = NameBuf; + size_t Period = Name.rfind('.'); + if (Period != StringRef::npos) + Name = Name.slice(Period + 1, Name.size()); + } else + Name = Inst->getOpcodeName(); + } + V.getValue()->setName(Name); + } +#endif + return LR; +} + +LiveRange *GenXLiveness::getOrCreateLiveRange(SimpleValue V, unsigned Cat, unsigned LogAlign) { + auto LR = getOrCreateLiveRange(V); + LR->setCategory(Cat); + LR->setLogAlignment(LogAlign); + return LR; +} + +/*********************************************************************** + * eraseLiveRange : get rid of live range for a Value, possibly multiple + * ones if it is a struct value + */ +void GenXLiveness::eraseLiveRange(Value *V) +{ + auto ST = dyn_cast(V->getType()); + if (!ST) { + eraseLiveRange(SimpleValue(V)); + return; + } + for (unsigned i = 0, e = IndexFlattener::getNumElements(ST); i != e; ++i) + eraseLiveRange(SimpleValue(V, i)); +} + +/*********************************************************************** + * eraseLiveRange : get rid of live range for a Value, if any + */ +void GenXLiveness::eraseLiveRange(SimpleValue V) +{ + auto LR = getLiveRangeOrNull(V); + if (LR) + eraseLiveRange(LR); +} + +/*********************************************************************** + * eraseLiveRange : get rid of the specified live range, and remove its + * values from the map + */ +void GenXLiveness::eraseLiveRange(LiveRange *LR) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + LiveRangeMap.erase(*vi); + delete LR; +} + +/*********************************************************************** + * getLiveRangeOrNull : get live range for value, or 0 if none + * + * The returned LiveRange pointer is valid only until the next time the + * live ranges are modified, including the case of coalescing. + */ +const LiveRange *GenXLiveness::getLiveRangeOrNull(SimpleValue V) const +{ + auto i = LiveRangeMap.find(V); + if (i == LiveRangeMap.end()) + return nullptr; + return i->second; +} + +LiveRange *GenXLiveness::getLiveRangeOrNull(SimpleValue V) +{ + return const_cast(static_cast(this)->getLiveRangeOrNull(V)); +} + +/*********************************************************************** + * getLiveRange : get live range for value + * + * The returned LiveRange pointer is valid only until the next time the + * live ranges are modified, including the case of coalescing. + */ +LiveRange *GenXLiveness::getLiveRange(SimpleValue V) +{ + LiveRange *LR = getLiveRangeOrNull(V); + assert(LR && "no live range found"); + return LR; +} + +/*********************************************************************** + * getUnifiedRet : get/create unified return value for a function + * + * Returns already created unified value, or creates new one + * if there was no such. 
+ */ +Value *GenXLiveness::getUnifiedRet(Function *F) +{ + auto RetIt = UnifiedRets.find(F); + if (RetIt == UnifiedRets.end()) + return createUnifiedRet(F); + return RetIt->second; +} + +/*********************************************************************** + * createUnifiedRet : create unified return value for a function + * + * To allow all returns in a function and all results of calls to that + * function to use the same register, we have a dummy "unified return + * value". + * + * Cannot be called on a function with void return type. + * + * This also creates the LiveRange for the unified return value, or + * multiple ones if it is struct type, and sets the category to the same as in + * one of the return instructions. + */ +Value *GenXLiveness::createUnifiedRet(Function *F) { + assert(!F->isDeclaration() && "must be a function definition"); + assert(UnifiedRets.find(F) == UnifiedRets.end() && + "Unified ret must not have been already created"); + Type *Ty = F->getReturnType(); + assert(!Ty->isVoidTy()); + auto URet = genx::createUnifiedRet(Ty, "", F->getParent()); + // Find some return inst. + ReturnInst *Ret = nullptr; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) + if ((Ret = dyn_cast(fi->getTerminator()))) + break; + assert(Ret && "must find return instruction"); + Value *RetVal = Ret->getOperand(0); + // Use the categories of its operand to set the categories of the unified + // return value. + for (unsigned StructIdx = 0, NumElements = IndexFlattener::getNumElements(Ty); + StructIdx != NumElements; ++StructIdx) { + int Cat = getOrCreateLiveRange(SimpleValue(RetVal, StructIdx)) + ->getOrDefaultCategory(); + SimpleValue SV(URet, StructIdx); + getOrCreateLiveRange(SV)->setCategory(Cat); + } + + UnifiedRets[F] = URet; + UnifiedRetToFunc[URet] = F; + return URet; +} + +/*********************************************************************** + * isUnifiedRet : test whether a value is a unified return value + * + * A unified ret value is a call instruction that is + * not attached to any BB, and is in the UnifiedRetFunc map. + */ +Function *GenXLiveness::isUnifiedRet(Value *V) +{ + // Quick checks first. + auto Inst = dyn_cast(V); + if (!Inst) + return nullptr; + if (Inst->getParent()) + return nullptr; + // Then map lookup. + auto i = UnifiedRetToFunc.find(V); + if (i == UnifiedRetToFunc.end()) + return nullptr; + return i->second; +} + +/*********************************************************************** + * moveUnifiedRet : move a function's unified return value to another function + * + * This is used when replacing a function with a new one in GenXArgIndirection. + */ +void GenXLiveness::moveUnifiedRet(Function *OldF, Function *NewF) +{ + auto i = UnifiedRets.find(OldF); + if (i == UnifiedRets.end()) + return; + Value *UR = i->second; + UnifiedRets[NewF] = UR; + UnifiedRets.erase(i); + UnifiedRetToFunc[UR] = NewF; +} + +/*********************************************************************** + * find : given an instruction number, find a segment in a live range + * + * If the number is within a segment, or is just on its end point, that + * segment is returned. If the number is in a hole, the next segment + * after the hole is returned. If the number is before the first + * segment, the first segment is returned. If the number is after the + * last segment, end() is returned. 
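+ *
+ * Illustrative example: with segments [4,8) and [12,20), find(8) returns the
+ * [4,8) segment (the end point counts as inside), find(10) returns [12,20),
+ * and find(25) returns end().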
+ */ +LiveRange::iterator LiveRange::find(unsigned Pos) +{ + size_t Len = size(); + if (!Len) + return end(); + if (Pos > Segments[Len - 1].getEnd()) + return end(); + iterator I = begin(); + do { + size_t Mid = Len >> 1; + if (Pos <= I[Mid].getEnd()) + Len = Mid; + else + I += Mid + 1, Len -= Mid + 1; + } while (Len); + assert(I->getEnd() >= Pos); + return I; +} + +/*********************************************************************** + * getOrDefaultCategory : get category; if none, set default + * + * The default category is PREDICATE for i1 or a vector of i1, or GENERAL + * for anything else. + */ +unsigned LiveRange::getOrDefaultCategory() +{ + unsigned Cat = getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + assert(!value_empty()); + SimpleValue SV = *value_begin(); + Type *Ty = IndexFlattener::getElementType( + SV.getValue()->getType(), SV.getIndex()); + if (Ty->getScalarType()->isIntegerTy(1)) + Cat = RegCategory::PREDICATE; + else + Cat = RegCategory::GENERAL; + setCategory(Cat); + return Cat; +} + +/*********************************************************************** + * interfere : check whether two live ranges interfere + * + * Two live ranges interfere if there is a segment from each that overlap + * and they are considered to cause interference by + * checkIfOverlappingSegmentsInterfere below. + */ +bool GenXLiveness::interfere(LiveRange *LR1, LiveRange *LR2) +{ + return getSingleInterferenceSites(LR1, LR2, nullptr); +} + +/*********************************************************************** + * twoAddrInterfere : check whether two live ranges interfere, allowing for + * single number interference sites at two address ops + * + * Return: true if they interfere + * + * Two live ranges interfere if there is a segment from each that overlap + * and are not both weak. + * + * But, if each interfering segment is a single number that is the precopy + * site of a two address op, and the result of the two address op is in one LR + * and the two address operand is in the other, then that is not counted as + * interference. + * + * That provision allows for coalescing at a two address op where the two + * address operand has already been copy coalesced with, or is the same value + * as, a different operand in the same bale, as follows: + * + * Suppose the two address op a has number N, and it has two address operand b + * and some other operand c in the same bale: + * + * N-1: (space for precopy) + * N: a = op(b, c) + * + * with live ranges + * a:[N-1,..) + * b:[..,N-1) + * c:[..,N) + * + * Then a and b can coalesce. + * + * But suppose b and c are the same value, or had previously been copy coalesced. + * Then we have live ranges + * a:[N-1,..) + * b,c:[..,N) + * + * and a and b now interfere needlessly. + * + * This function is called on an attempt to coalesce a and b (or rather the + * live range containing a and the live range containing b). In it, we see + * that there is a single number segment of interference [N-1,N), where a is + * the result and b is the two address operand of the two address op at N. Thus + * we discount that segment of interference, and a and b can still coalesce. + * + * Note that this formulation allows for there to be multiple such sites because + * of multiple two address results being already coalesced together through phi + * nodes. 
+ */ +bool GenXLiveness::twoAddrInterfere(LiveRange *LR1, LiveRange *LR2) +{ + SmallVector Sites; + if (getSingleInterferenceSites(LR1, LR2, &Sites)) + return true; // interferes, not just single number sites + if (Sites.empty()) + return false; // does not interfere at all + // Put the single number sites in a set. + SmallSet SitesSet; + LLVM_DEBUG(dbgs() << "got single number interference sites:"); + for (auto i = Sites.begin(), e = Sites.end(); i != e; ++i) { + LLVM_DEBUG(dbgs() << " " << *i); + SitesSet.insert(*i); + } + LLVM_DEBUG(dbgs() << "\nbetween:\n" << *LR1 << "\n" << *LR2 << "\n"); + Sites.clear(); + // Check each def in LR1 and LR2 for being a two address op that causes us to + // discount a single number interference site. + for (auto LR = LR1, OtherLR = LR2; LR; + LR = LR == LR1 ? LR2 : nullptr, OtherLR = LR1) { + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + auto CI = dyn_cast(vi->getValue()); + if (!CI) + continue; + int OperandNum = getTwoAddressOperandNum(CI); + if (OperandNum < 0) + continue; + // Got a two addr op. Check whether the two addr operand is in the other + // LR. + if (getLiveRangeOrNull(CI->getOperand(OperandNum)) != OtherLR) + continue; + // Discount the single number interference site here, if there is one. + SitesSet.erase(getNumbering()->getNumber(CI) - 1); + } + } + // If we have discounted all sites, then there is no interference. + return !SitesSet.empty(); +} + +/*********************************************************************** + * getSingleInterferenceSites : check whether two live ranges interfere, + * returning single number interference sites + * + * Enter: LR1, LR2 = live ranges to check + * Sites = vector in which to store single number interference sites, + * or 0 if we do not want to collect them + * + * Return: true if the live ranges interfere other than as reflected in Sites + * + * Two live ranges interfere if there is a segment from each that overlap + * and are not both weak. + * + * If Sites is 0 (the caller does not want the Sites list), then the function + * returns true if there is any interference. + * + * If Sites is not 0, then any interference in a single number segment, for + * example [19,20), causes the start number to be pushed into Sites. The + * function returns true only if there is interference that cannot be described + * in Sites. + */ +bool GenXLiveness::getSingleInterferenceSites(LiveRange *LR1, LiveRange *LR2, + SmallVectorImpl *Sites) +{ + // Swap if necessary to make LR1 the one with more segments. + if (LR1->size() < LR2->size()) + std::swap(LR1, LR2); + auto Idx2 = LR2->begin(), End2 = LR2->end(); + // Find segment in LR1 that contains or is the next after the start + // of the first segment in LR2, including the case that the start of + // the LR2 segment abuts the end of the LR1 segment. + auto Idx1 = LR1->find(Idx2->getStart()), End1 = LR1->end(); + if (Idx1 == End1) + return false; + for (;;) { + // Check for overlap. + if (Idx1->getStart() < Idx2->getStart()) { + if (Idx1->getEnd() > Idx2->getStart()) + if (checkIfOverlappingSegmentsInterfere(LR1, Idx1, LR2, Idx2)) { + // Idx1 overlaps Idx2. Check if it is a single number overlap that can + // be pushed into Sites. + if (!Sites || Idx1->getEnd() != Idx2->getStart() + 1) + return true; + Sites->push_back(Idx2->getStart()); + } + } else { + if (Idx1->getStart() < Idx2->getEnd()) + if (checkIfOverlappingSegmentsInterfere(LR1, Idx1, LR2, Idx2)) { + // Idx2 overlaps Idx1. 
Check if it is a single number overlap that can + // be pushed into Sites. + if (!Sites || Idx2->getEnd() != Idx1->getStart() + 1) + return true; + Sites->push_back(Idx1->getStart()); + } + } + // Advance whichever one has the lowest End. + if (Idx1->getEnd() < Idx2->getEnd()) { + if (++Idx1 == End1) + return false; + } else { + if (++Idx2 == End2) + return false; + } + } +} + +/*********************************************************************** + * checkIfOverlappingSegmentsInterfere : given two segments that have been + * shown to overlap, check whether their strengths make them interfere + * + * If both segments are weak, they do not interfere. + * + * Interference between a normal segment in one LR and a phicpy segment in the + * other LR is ignored, as long as the phicpy segment relates to a phi incoming + * where the phi node is in the LR with the phicpy segment and the incoming + * value is in the LR with the strong segment. This is used to avoid + * unnecessary interference for a phi incoming through a critical edge, where + * the incoming is likely to be used in the other successor as well. + */ +bool GenXLiveness::checkIfOverlappingSegmentsInterfere( + LiveRange *LR1, Segment *S1, LiveRange *LR2, Segment *S2) +{ + if (S1->isWeak() && S2->isWeak()) + return false; // both segments weak + if (S2->Strength == Segment::PHICPY) { + // Swap so that if either segment is phicpy, then it is S1 for the check + // below. + std::swap(LR1, LR2); + std::swap(S1, S2); + } + if (S1->Strength != Segment::PHICPY) + return true; + // S1 is phicpy. If its corresponding phi cpy insertion point is for a phi + // node in LR1 and an incoming in LR2, then this does not cause interference. + auto PhiIncoming = Numbering->getPhiIncomingFromNumber(S1->getStart()); + assert(PhiIncoming.first && "phi incoming not found"); + if (getLiveRange(PhiIncoming.first) != LR1) + return true; // phi not in LR1, interferes + if (getLiveRangeOrNull( + PhiIncoming.first->getIncomingValue(PhiIncoming.second)) != LR2) + return true; // phi incoming not in LR2, interferes + // Conditions met -- does not cause interference. + return false; +} + +/*********************************************************************** + * coalesce : coalesce two live ranges that do not interfere + * + * Enter: LR1 = first live range + * LR2 = second live range + * DisallowCASC = true to disallow call arg special coalescing + * into the resulting live range + * + * Return: new live range (LR1 and LR2 now invalid) + */ +LiveRange *GenXLiveness::coalesce(LiveRange *LR1, LiveRange *LR2, + bool DisallowCASC) +{ + assert(LR1 != LR2 && "cannot coalesce an LR to itself"); + assert(LR1->Category == LR2->Category && "cannot coalesce two LRs with different categories"); + // Make LR1 the one with the longer list of segments. + if (LR2->Segments.size() > LR1->Segments.size()) { + LiveRange *temp = LR1; + LR1 = LR2; + LR2 = temp; + } + LLVM_DEBUG( + dbgs() << "Coalescing \""; + LR1->print(dbgs()); + dbgs() << "\" and \""; + LR2->print(dbgs()); + dbgs() << "\"\n" + ); + // Do the merge of the segments. + merge(LR1, LR2); + // Copy LR2's values across to LR1. + for (auto i = LR2->value_begin(), e = LR2->value_end(); i != e; ++i) + LiveRangeMap[LR1->addValue(*i)] = LR1; + // Use the largest alignment from the two LRs. + LR1->LogAlignment = std::max(LR1->LogAlignment, LR2->LogAlignment); + // If either LR has a non-zero offset, use it. + assert(!LR1->Offset || !LR2->Offset); + LR1->Offset |= LR2->Offset; + // Set DisallowCASC. 
+ LR1->DisallowCASC |= LR2->DisallowCASC | DisallowCASC; + delete LR2; + LLVM_DEBUG( + dbgs() << " giving \""; + LR1->print(dbgs()); + dbgs() << "\"\n" + ); + return LR1; +} + +/*********************************************************************** + * copyInterfere : check whether two live ranges copy-interfere + * + * Two live ranges LR1 and LR2 copy-interfere (a non-commutative relation) + * if LR1 includes a value that is a phi node whose definition is within + * LR2. + */ +bool GenXLiveness::copyInterfere(LiveRange *LR1, LiveRange *LR2) +{ + // Find a phi node value in LR1. It can have at most one, because only + // copy coalescing has occurred up to now, and copy coalescing does not + // occur at a phi node. + for (unsigned i = 0, e = LR1->Values.size(); i != e; ++i) { + auto Phi = dyn_cast(LR1->Values[i].getValue()); + if (!Phi) + continue; + // Found a phi node in LR1. A phi node has multiple instruction numbers, + // one for each incoming block. See if any one of those is in LR2's + // live range. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) + if (LR2->contains(Numbering->getPhiNumber(Phi, Phi->getIncomingBlock(i)))) + return true; + break; + } + return false; // no phi node found +} + +/*********************************************************************** + * wrapsAround : detects if V1 is a phi node and V2 wraps round to a use + * in a phi node in the same basic block as V1 and after it + */ +bool GenXLiveness::wrapsAround(Value *V1, Value *V2) +{ + auto PhiDef = dyn_cast(V1); + if (!PhiDef) + return false; + for (auto ui = V2->use_begin(), ue = V2->use_end(); ui != ue; ++ui) { + if (auto PhiUse = dyn_cast(ui->getUser())) { + if (PhiUse->getParent() == PhiDef->getParent()) { + // Phi use in the same BB. Scan until we find PhiDef or the end + // of the phi nodes. + while (PhiUse != PhiDef) { + PhiUse = dyn_cast(PhiUse->getNextNode()); + if (!PhiUse) + return true; // reach end of phi nodes + } + } + } + } + return false; +} + +/*********************************************************************** + * insertCopy : insert a copy of a non-struct value + * + * Enter: InputVal = value to copy + * LR = live range to add the new value to (0 to avoid adjusting + * live ranges) + * InsertBefore = insert copy before this inst + * Name = name to give the new value + * Number = number to give the new instruction(s), 0 for none + * + * Return: The new copy instruction + * + * This inserts multiple copies if the input value is a vector that is + * bigger than two GRFs or a non power of two size. + * + * This method is mostly used from GenXCoalescing, which passes an LR to + * add the new copied value to. + * + * It is also used from GenXLiveRange if it needs to add a copy to break an + * overlapping circular phi value, in which case LR is 0 as we do not want to + * adjust live ranges. Also at this stage there is no baling info to update. + */ +Instruction *GenXLiveness::insertCopy(Value *InputVal, LiveRange *LR, + Instruction *InsertBefore, const Twine &Name, unsigned Number) +{ + assert(!isa(InputVal)); + bool AdjustLRs = LR != nullptr; + LiveRange *SourceLR = nullptr; + if (AdjustLRs) + SourceLR = getLiveRange(InputVal); + auto InputTy = InputVal->getType(); + if (InputTy->getScalarType()->isIntegerTy(1)) { + // The input is a predicate. + if (!isa(InputVal)) { + // The predicate input is not a constant. + // There is no way in vISA of copying from one + // predicate to another, so we copy all 0s into the destination + // then "or" the source into it. 
+ Instruction *NewInst = CastInst::Create(Instruction::BitCast, + Constant::getNullValue(InputTy), InputTy, Name, InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + NewInst = BinaryOperator::Create(Instruction::Or, NewInst, InputVal, Name, + InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + // Predicate input is constant. + auto NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputTy, Name, InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + Instruction *NewInst = nullptr; + if (InputTy->isPointerTy()) { + // BitCast used to represent a normal copy. + NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputVal->getType(), Name, InsertBefore); + if (Number) + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + + Region R(InputVal); + unsigned MaxNum = R.ElementBytes == 1 ? 32 : 64 / R.ElementBytes; + if (exactLog2(R.NumElements) >= 0 && R.NumElements <= MaxNum) { + // Can be done with a single copy. + if (SourceLR && (SourceLR->Category != RegCategory::GENERAL + || (LR && LR->Category != RegCategory::GENERAL))) { + // Need a category conversion (including the case that the two + // categories are the same but not GENERAL). + NewInst = createConvert(InputVal, Name, InsertBefore); + } else { + // BitCast used to represent a normal copy. + NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputVal->getType(), Name, InsertBefore); + } + if (Number) + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + + auto collectFragment = [](Value *V, unsigned MaxFrag, + SmallVectorImpl>& Frag, + unsigned MaxElt) { + while (!isa(V)) { + if (!GenXIntrinsic::isWrRegion(V)) + return true; + IntrinsicInst *WII = cast(V); + Region R(WII, BaleInfo()); + if (R.Indirect || !R.isContiguous() || !R.isWholeNumRows()) + return true; + if ((R.Offset % R.ElementBytes) != 0) + return true; + unsigned Base = R.Offset / R.ElementBytes; + for (unsigned Offset = 0; Offset < R.NumElements; /*EMPTY*/) { + unsigned NumElts = std::min(MaxElt, R.NumElements - Offset); + // Round NumElts down to power of 2. That is how many elements we + // are copying this time round the loop. + NumElts = 1 << genx::log2(NumElts); + Frag.push_back(std::make_pair(Base + Offset, NumElts)); + Offset += NumElts; + } + V = WII->getOperand(0); + } + if (Frag.size() > MaxFrag) + return true; + std::sort(Frag.begin(), Frag.end()); + return false; + }; + + unsigned NumElements = R.NumElements; + SmallVector, 8> Fragments; + unsigned MaxCopies = (NumElements + MaxNum - 1) / MaxNum; + if (collectFragment(InputVal, MaxCopies, Fragments, MaxNum)) { + Fragments.clear(); + for (unsigned Offset = 0; Offset < NumElements; /*EMPTY*/) { + unsigned NumElts = std::min(MaxNum, NumElements - Offset); + // Round NumElts down to power of 2. That is how many elements we + // are copying this time round the loop. + NumElts = 1 << genx::log2(NumElts); + Fragments.push_back(std::make_pair(Offset, NumElts)); + Offset += NumElts; + } + } + // Need to split the copy up. Start with an undef destination. + Value *Res = UndefValue::get(InputVal->getType()); + for (auto &I : Fragments) { + unsigned Offset = I.first; + // Set the subregion. 
+ R.NumElements = I.second; + R.Width = R.NumElements; + R.Offset = Offset * R.ElementBytes; + // Create the rdregion. Do not add this to a live range because it is + // going to be baled in to the wrregion. + Instruction *RdRegion = R.createRdRegion(InputVal, Name, InsertBefore, + DebugLoc(), true/*AllowScalar*/); + if (Baling) + Baling->setBaleInfo(RdRegion, BaleInfo(BaleInfo::RDREGION, 0)); + if (Number) + Numbering->setNumber(RdRegion, Number); + // Create the wrregion, and mark that it bales in the rdregion (operand 1). + NewInst = cast(R.createWrRegion(Res, RdRegion, Name, + InsertBefore, DebugLoc())); + if (Number) + Numbering->setNumber(NewInst, Number); + if (Baling) { + BaleInfo BI(BaleInfo::WRREGION); + BI.setOperandBaled(1); + Baling->setBaleInfo(NewInst, BI); + } + if (AdjustLRs) { + // Add the last wrregion to the live range (thus coalescing them all + // together and in with the phi node or two address op that we're doing + // the copy for). + setLiveRange(SimpleValue(NewInst), LR); + } + Res = NewInst; + } + return NewInst; +} + +/*********************************************************************** + * merge : merge segments of LR2 into LR1 + * + * This is equivalent to addSegments followed by sortAndMerge. + * + * Previously there was some code here that attempted to optimize on the + * assumption that the caller passed the one with the longer list of segments + * as LR1. However that became too complicated once we introduced weak and + * strong liveness. + * + * One day we could re-introduce some simple optimized paths, such as when + * LR2 has a single segment. + */ +void GenXLiveness::merge(LiveRange *LR1, LiveRange *LR2) +{ + LR1->addSegments(LR2); + LR1->sortAndMerge(); +} + +/*********************************************************************** + * eraseUnusedTree : erase unused tree of instructions + * + * Enter: Inst = root of unused tree + * + * This erases Inst, then recursively erases other instructions that become + * unused. Erased instructions are also removed from liveness. + * + * Other than the given Inst, this does not erase a non-intrinsic call, or + * an intrinsic call with a side effect. + * + * Instead of erasing as we go, we undef operands to make them unused and then + * erase everything at the end. This is required for the case that we have an + * unused DAG of instructions rather than just an unused tree, for example + * where we have a rd-wr sequence and all the rds use the same input. 
+ */ +void GenXLiveness::eraseUnusedTree(Instruction *TopInst) +{ + SmallVector Stack; + std::set ToErase; + Stack.push_back(TopInst); + while (!Stack.empty()) { + auto Inst = Stack.back(); + Stack.pop_back(); + if (!Inst->use_empty()) + continue; + if (TopInst != Inst) { + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) + continue; + if (!CI->doesNotAccessMemory()) + continue; + } + } + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) + if (auto OpndInst = dyn_cast(Inst->getOperand(oi))) { + Stack.push_back(OpndInst); + Inst->setOperand(oi, UndefValue::get(OpndInst->getType())); + } + removeValue(Inst); + ToErase.insert(Inst); + } + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); +} + +/*********************************************************************** + * getAddressBase : get the base register of an address + * + * Enter: Addr = address conversion (genx.convert.addr instruction) + * + * Return: The Value for the base that the address is used with, or some + * other Value that is coalesced with that + */ +Value *GenXLiveness::getAddressBase(Value *Addr) +{ + // Get the base register from the rdregion/wrregion that the index is used + // in. This might involve going via an add or an rdregion. + Use *U = &*Addr->use_begin(); + auto user = cast(U->getUser()); + while (!U->getOperandNo()) { + U = &*user->use_begin(); + user = cast(U->getUser()); + } + if (GenXIntrinsic::isRdRegion(user)) + return user->getOperand(0); + if (GenXIntrinsic::isWrRegion(user)) { + auto Head = Baling->getBaleHead(user); + if (Head && isa(Head)) { + Value *V = Head->getOperand(1); + V = getUnderlyingGlobalVariable(V); + assert(V && "null base not expected"); + return V; + } + return user; + } + // The above scheme does not work for an address conversion added by + // GenXArgIndirection. Instead we have AddressBaseMap to provide the mapping. + auto i = ArgAddressBaseMap.find(Addr); + assert(i != ArgAddressBaseMap.end() && "base register not found for address"); + Value *BaseV = i->second; + LiveRange *LR = getLiveRange(BaseV); + // Find a SimpleValue in the live range that is not a struct member. + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (!isa(V->getType())) + return V; + } + llvm_unreachable("non-struct value not found"); +} + +/*********************************************************************** + * isBitCastCoalesced : see if the bitcast has been coalesced away + * + * This handles the case that the input and result of the bitcast are coalesced + * in to the same live range. + */ +bool GenXLiveness::isBitCastCoalesced(BitCastInst *BCI) +{ + return getLiveRangeOrNull(BCI) == getLiveRangeOrNull(BCI->getOperand(0)); +} + +/*********************************************************************** + * dump, print : dump the liveness info + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXLiveness::dump() +{ + print(errs()); errs() << '\n'; +} +void LiveRange::dump() const +{ + print(errs()); errs() << '\n'; +} +#endif + +void GenXLiveness::print(raw_ostream &OS) const +{ + OS << "GenXLiveness for FunctionGroup " << FG->getName() << "\n"; + for (const_iterator i = begin(), e = end(); i != e; ++i) { + LiveRange *LR = i->second; + // Only show an LR if the map iterator is on the value that appears first + // in the LR. That avoids printing the same LR multiple times. 
+ if (i->first == *LR->value_begin()) { + LR->print(OS); + OS << "\n"; + } + } + OS << "\n"; +} + +#ifndef NDEBUG +/*********************************************************************** + * LiveRange::assertOk : assert that no segments abut or overlap or are + * in the wrong order + */ +void LiveRange::assertOk() +{ + // Assert that no segments abut or overlap or are in the wrong order. + iterator Idx1 = begin(), End1 = end(); + Idx1++; + for (; Idx1 != End1; ++Idx1) + assert(((Idx1 - 1)->Strength != Idx1->Strength || + (Idx1 - 1)->getEnd() < Idx1->getStart()) && + "invalid live range"); +} +#endif + +/*********************************************************************** + * LiveRange::addSegment : add a segment to a live range + * + * The segment might already be covered by an existing segment, in which + * case nothing changes. + * + * It would be inefficient to implement coalesce() in terms of this, because + * it might have to shuffle lots of elements along by one each time. + * This function only gets called when adding a single segment to a live + * range when inserting a copy in coalescing. + */ +void LiveRange::addSegment(Segment Seg) +{ + iterator i = find(Seg.getStart()), e = end(); + if (i == e) { + // New segment off end. + Segments.push_back(Seg); + } else if (i->getStart() <= Seg.getStart()) { + // New segment is covered by or overlaps the end of old segment i. + if (i->getEnd() < Seg.getEnd()) { + i->setEnd(Seg.getEnd()); + // See if it covers or overlaps any later segments. + iterator j = i + 1; + while (j != e) { + if (j->getStart() > Seg.getEnd()) + break; + i->setEnd(j->getEnd()); + if (j->getEnd() >= Seg.getEnd()) + break; + ++j; + } + Segments.erase(i + 1, j); + } + } else if (i->getStart() == Seg.getEnd()) { + // New segment abuts start of old segment i, without abutting or + // overlapping anything before. + i->setStart(Seg.getStart()); + } else { + // New segment is completely in a hole just before i. 
+ Segments.insert(i, Seg); + } + assertOk(); +} + +/*********************************************************************** + * LiveRange::setSegmentsFrom : for this live range, clear out its segments + * and copy them from the other live range + */ +void LiveRange::setSegmentsFrom(LiveRange *Other) +{ + Segments.clear(); + Segments.append(Other->Segments.begin(), Other->Segments.end()); +} + +/*********************************************************************** + * LiveRange::addSegments : add segments of LR2 into this + */ +void LiveRange::addSegments(LiveRange *LR2) +{ + Segments.append(LR2->Segments.begin(), LR2->Segments.end()); +} + +/*********************************************************************** + * LiveRange::sortAndMerge : after doing some push_backs, sort the segments, + * and merge overlapping/adjacent ones + */ +void LiveRange::sortAndMerge() { + std::sort(Segments.begin(), Segments.end()); + + // Ensure that there are no duplicate segments: + Segments_t::iterator ip; + ip = std::unique(Segments.begin(), Segments.end()); + Segments.resize(std::distance(Segments.begin(), ip)); + + Segments_t SegmentsSortedEnd = Segments; + std::sort(SegmentsSortedEnd.begin(), SegmentsSortedEnd.end(), + [](Segment L, Segment R) { + if (L.getEnd() != R.getEnd()) + return L.getEnd() < R.getEnd(); + return L.getStart() < R.getStart(); + }); + + Segments_t NewSegments; + std::unordered_set OpenedSegments; + Segment *SS = Segments.begin(); + Segment *ES = SegmentsSortedEnd.begin(); + unsigned prevBorder = 0; + unsigned curBorder = 0; + bool isStartBorder; + + // Split & Merge + while (ES != SegmentsSortedEnd.end()) { + if (SS != Segments.end() && SS->getStart() < ES->getEnd()) { + isStartBorder = true; + curBorder = SS->getStart(); + } else { + isStartBorder = false; + curBorder = ES->getEnd(); + } + + // To create or extend segment, first check that there are + // open segments or that we haven't already created or extended one + if (OpenedSegments.size() > 0 && prevBorder < curBorder) { + Segment NS = + *std::max_element(OpenedSegments.begin(), OpenedSegments.end(), + [](Segment L, Segment R) { + return L.Strength < R.Strength; + }); // New segment + if (NewSegments.size() > 0 && + NewSegments.rbegin()->getEnd() == prevBorder && + // This segment and previous segment abut or overlap. Merge + // as long as they have the same strength. + (NS.Strength == NewSegments.rbegin()->Strength || + // Also allow for the case that the first one is strong and the + // second one is phicpy. The resulting merged segment is strong, + // because a phicpy segment is valid only if it starts in the + // same place as when it was originally created and there is no + // liveness just before it. + (NS.Strength == Segment::PHICPY && + NewSegments.rbegin()->Strength == Segment::STRONG))) { + // In these cases we can extend + NewSegments.rbegin()->setEnd(curBorder); + } else { + NS.setStart(prevBorder); + NS.setEnd(curBorder); + NewSegments.push_back(NS); + } + } + prevBorder = curBorder; + if (isStartBorder) + OpenedSegments.insert(*SS++); + else + OpenedSegments.erase(*ES++); + } + Segments = NewSegments; +} + +/*********************************************************************** + * LiveRange::prepareFuncs : fill the Funcs set with kernel or stack functions + * which this LR is alive in + * + * To support RegAlloc for function groups that consist of kernel and stack + * functions we have to track which kernel/stack functions the LR spans across. 
+ * + */ +void LiveRange::prepareFuncs(FunctionGroupAnalysis *FGA) { + for (auto &val : getValues()) { + auto Inst = dyn_cast(val.getValue()); + Function *DefFunc = nullptr; + if (Inst && Inst->getParent()) + DefFunc = Inst->getFunction(); + else if (auto Arg = dyn_cast(val.getValue())) + DefFunc = Arg->getParent(); + + if (DefFunc) + Funcs.insert(FGA->getSubGroup(DefFunc) + ? FGA->getSubGroup(DefFunc)->getHead() + : FGA->getGroup(DefFunc)->getHead()); + + for (auto U : val.getValue()->users()) + if (Instruction *userInst = dyn_cast(U)) { + auto F = userInst->getFunction(); + Funcs.insert(FGA->getSubGroup(F) ? FGA->getSubGroup(F)->getHead() + : FGA->getGroup(F)->getHead()); + } + } +} + +/*********************************************************************** + * LiveRange::getLength : add up the number of instructions covered by this LR + */ +unsigned LiveRange::getLength(bool WithWeak) +{ + unsigned Length = 0; + for (auto i = begin(), e = end(); i != e; ++i) { + if (i->isWeak() && !WithWeak) + continue; + Length += i->getEnd() - i->getStart(); + } + return Length; +} + +/*********************************************************************** + * LiveRange::print : print the live range + */ +void LiveRange::print(raw_ostream &OS) const +{ + auto vi = Values.begin(), ve = Values.end(); + assert(vi != ve); + for (;;) { + vi->printName(OS); + if (++vi == ve) + break; + OS << ","; + } + OS << ":"; + printSegments(OS); + const char *Cat = "???"; + switch (Category) { + case RegCategory::NONE: Cat = "none"; break; + case RegCategory::GENERAL: Cat = "general"; break; + case RegCategory::ADDRESS: Cat = "address"; break; + case RegCategory::PREDICATE: Cat = "predicate"; break; + case RegCategory::EM: Cat = "em"; break; + case RegCategory::RM: Cat = "rm"; break; + case RegCategory::SAMPLER: Cat = "sampler"; break; + case RegCategory::SURFACE: Cat = "surface"; break; + case RegCategory::VME: Cat = "vme"; break; + } + OS << "{" << Cat << ",align" << (1U << LogAlignment); + if (Offset) + OS << ",offset" << Offset; + OS << "}"; +} + +/*********************************************************************** + * LiveRange::printSegments : print the live range's segments + */ +void LiveRange::printSegments(raw_ostream &OS) const +{ + for (auto ri = Segments.begin(), re = Segments.end(); + ri != re; ++ri) { + OS << "["; + switch (ri->Strength) { + case Segment::WEAK: OS << "w"; break; + case Segment::PHICPY: OS << "ph"; break; + } + OS << ri->getStart() << "," << ri->getEnd() << ")"; + } +} + +/*********************************************************************** + * IndexFlattener::flatten : convert struct indices into a flattened index + * + * This has a special case of Indices having a single element that is the + * number of elements in ST, which returns the total number of flattened + * indices in the struct. + * + * This involves scanning through the struct layout each time it is called. + * If it is used a lot, it might benefit from some cacheing of the results. 
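+ *
+ * For illustration (a hypothetical struct type): for {i32, {i8, i16}, float}
+ * the flattened indices are 0 (i32), 1 (i8), 2 (i16) and 3 (float), so
+ * flatten(ST, {1,1}) returns 2 and flatten(ST, {2}) returns 3, while the
+ * special case flatten(ST, {3}) returns the total of 4.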
+ */ +unsigned IndexFlattener::flatten(StructType *ST, ArrayRef Indices) +{ + if (!Indices.size()) + return 0; + unsigned Flattened = 0; + unsigned i = 0; + for (; i != Indices[0]; ++i) { + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) + Flattened += flatten(ElST, ElST->getNumElements()); + else + ++Flattened; + } + if (i == ST->getNumElements()) + return Flattened; // handle special case noted at the top + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) + Flattened += flatten(ElST, Indices.slice(1)); + return Flattened; +} + +/*********************************************************************** + * IndexFlattener::unflatten : convert flattened index into struct indices + * + * Enter: Indices = vector to put unflattened indices into + * + * Return: number left over from flattened index if it goes off the end + * of the struct (used internally when recursing). If this is + * non-zero, nothing has been pushed into Indices + * + * This involves scanning through the struct layout each time it is called. + * If it is used a lot, it might benefit from some cacheing of the results. + */ +unsigned IndexFlattener::unflatten(StructType *ST, unsigned Flattened, + SmallVectorImpl *Indices) +{ + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) { + Indices->push_back(i); + Flattened = unflatten(ElST, Flattened, Indices); + if (!Flattened) + return 0; + Indices->pop_back(); + } else if (!Flattened--) { + Indices->push_back(i); + return 0; + } + } + return Flattened; +} + +/*********************************************************************** + * IndexFlattener::getElementType : get type of struct element from + * flattened index + * + * Enter: Ty = type, possibly struct type + * FlattenedIndex = flattened index in the struct, 0 if not struct + * + * Return: type of that element + */ +Type *IndexFlattener::getElementType(Type *Ty, unsigned FlattenedIndex) +{ + auto ST = dyn_cast(Ty); + if (!ST) + return Ty; + SmallVector Indices; + IndexFlattener::unflatten(ST, FlattenedIndex, &Indices); + Type *T = 0; + for (unsigned i = 0;;) { + T = ST->getElementType(Indices[i]); + if (++i == Indices.size()) + return T; + ST = cast(T); + } +} + +/*********************************************************************** + * IndexFlattener::flattenArg : flatten an arg in a function or call + * + * This calculates the total number of flattened indices used up by previous + * args. If all previous args are not struct type, then this just returns the + * arg index. 
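+ *
+ * For illustration (a hypothetical signature): for void f(i32, {float, float},
+ * i8), flattenArg(FT, 2) returns 3, since the first two args use 1 + 2
+ * flattened indices.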
+ */ +unsigned IndexFlattener::flattenArg(FunctionType *FT, unsigned ArgIndex) +{ + unsigned FlattenedIndex = 0; + while (ArgIndex--) { + Type *ArgTy = FT->getParamType(ArgIndex); + FlattenedIndex += getNumElements(ArgTy); + } + return FlattenedIndex; +} + +/*********************************************************************** + * SimpleValue::getType : get the type of the SimpleValue + */ +Type *SimpleValue::getType() +{ + return IndexFlattener::getElementType(V->getType(), Index); +} + +/*********************************************************************** + * dump, print : debug print a SimpleValue + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SimpleValue::dump() const +{ + print(errs()); errs() << '\n'; +} +#endif +void SimpleValue::print(raw_ostream &OS) const +{ + OS << V->getName(); + if (Index || isa(V->getType())) + OS << "#" << Index; +} +void SimpleValue::printName(raw_ostream &OS) const +{ + OS << V->getName(); + if (Index || isa(V->getType())) + OS << "#" << Index; +} + +/*********************************************************************** + * CallGraph::build : build the call graph for the FunctionGroup + * + * The call graph is acyclic because no recursive edges added here + * CM supports recursion though + */ +void CallGraph::build(GenXLiveness *Liveness) +{ + Nodes.clear(); + // Create a node for each Function. + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + (void)Nodes[F]; + } + // For each Function, find its call sites and add edges for them. + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Value::use_iterator ui = F->use_begin(), ue = F->use_end(); + ui != ue; ++ui) { + // TODO: deduce possible callsites thru cast chains + if (isa(ui->getUser())) { + auto Call = cast(ui->getUser()); + auto Caller = Call->getParent()->getParent(); + // do not add edges for recursive calls + if (Caller != F) + Nodes[Caller].insert( + Edge(Liveness->getNumbering()->getNumber(Call), Call)); + } + } + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h new file mode 100644 index 000000000000..1ce86c0dbbdd --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h @@ -0,0 +1,666 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXLiveness +/// ------------ +/// +/// GenXLiveness is an analysis that contains the liveness information for the +/// values in the code. Unlike the usual LLVM liveness analysis, the values +/// are in LLVM IR rather than machine IR. +/// +/// This GenXLiveness pass is a container for the data structures required +/// for liveness analysis, plus methods to perform the analysis. The pass itself +/// does nothing; later passes manipulate it: +/// +/// * GenXCategory creates a LiveRange and sets the category on it for each +/// value. +/// +/// * GenXLiveRanges calls GenXLiveness to set up the LiveRange for each +/// value that needs one (a non-baled instruction or a function argument), +/// and erases the LiveRange for a value that does not need one (a baled +/// in instruction). +/// +/// GenXLiveness is a FunctionGroupPass, because we want to share liveness +/// information between all the Functions in a FunctionGroup (i.e. between a +/// GenX kernel/function and its subroutines). Any pass that uses GenXLiveness, +/// which is almost all passes that run after it, must itself be a +/// FunctionGroupPass. +/// +/// Here is what a LiveRange might look like if you dump() it in the debugger, +/// or see it as part of the liveness info in a -print-after-all: +/// +/// ``add12.split48172:[145,199){general,align32}`` +/// +/// * ``add12.split48172`` is the Value attached to the LiveRange. As outlined below, +/// a LiveRange actually has SimpleValues rather than Values; if the attached +/// SimpleValue had been an element of a struct rather than a scalar value in +/// its own right, the name would have had # then the flattened index appended. +/// +/// * A LiveRange can have more than one value attached after GenXCoalescing. +/// This would be shown by multiple comma-separated names. +/// +/// * ``[145,199)`` is the segment in which the LiveRange is live. A LiveRange can +/// have multiple segments. This one is a normal (strong) segment; a weak one has +/// the start number prefixed with 'w' and a phicpy one has the start number +/// prefixed with 'ph'. +/// +/// * ``general`` is the register category of the LiveRange. +/// +/// * ``align32`` shows that the LiveRange has been marked as needing to be 32 +/// byte (i.e. GRF) aligned. +/// +/// * If the LiveRange was a kernel argument, its allocated offset would have +/// been shown with the word 'offset'. +/// +/// SimpleValue +/// ^^^^^^^^^^^ +/// +/// Liveness information deals with SimpleValues rather than Values. +/// SimpleValue (a GenX backend specific class) is the entity that can have +/// a live range attached and a register allocated. A SimpleValue is either a +/// non-struct Value, or a non-struct element of a struct Value (where the +/// struct can contain nested structs). +/// +/// A SimpleValue is represented by a pair: +/// +/// - a Value * +/// - a flattened index for a non-struct element of a struct, otherwise 0 +/// +/// Having a flattened index (as generated by IndexFlattener::flatten()) allows +/// us to encode an element in multiply nested structs with a single index. +/// +/// The idea of SimpleValue is that, where the LLVM IR contains a struct value, +/// which is unavoidable when a function has multiple return values, we want +/// to allocate a register to each non-struct element, not the whole struct. 
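+///
+/// For illustration (a hypothetical subroutine, not taken from the passes
+/// above): a call whose result has type ``{<8 x float>, i32}`` gives rise to
+/// two SimpleValues, (``%call``, 0) for the vector and (``%call``, 1) for the
+/// i32, so each element can be given its own live range and register.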
+/// +/// Segments +/// ^^^^^^^^ +/// +/// A live range consists of one or more non-overlapping *segments*, where each +/// segment has a start (inclusive) and end (exclusive) instruction number, and a +/// strength, which is strong (normal), weak (see below) or phicpy (see below). +/// Two segments cannot be abutting if they have the same +/// strength. Later passes can interrogate this information to find out whether +/// two live ranges interfere, and can modify it by coalescing (merging) two +/// live ranges. After coalescing, multiple SimpleValues share the same live +/// range. +/// +/// The numbering of instructions is handled in GenXNumbering. +/// +/// Weak liveness +/// ^^^^^^^^^^^^^ +/// +/// A live range that extends over a call has the entire range of the called +/// subroutine, and any subroutines it can call, added to it. This makes that +/// live range interfere with any live range inside the subroutine, and thus +/// stops them using the same register. +/// +/// However, because a subroutine has a single range in instruction numbering, +/// rather than one range per call site, this scheme means that two values A +/// and B that are live over two *different* call sites of the same subroutine +/// both include the subroutine's range, and thus look like they interfere. +/// This could stop A and B being coalesced, and thus add extra code and +/// register pressure. +/// +/// To fix this, we have the concept of *weak liveness*. The values A and B +/// are only weakly live inside the subroutine. Two values are considered to +/// interfere only if there is some point where both are live, and at least +/// one of them is not weakly live at that point. +/// +/// Thus, in our A and B example, A and B each interferes with any value inside +/// the subroutine, but not with each other. +/// +/// Phicpy liveness +/// ^^^^^^^^^^^^^^^ +/// +/// A phi node has a short segment of liveness (a *phicpy segment*) at the end +/// of each of its incoming blocks, from the phi copy insertion point up to the +/// end of the block. The use of the incoming value in the phi node is counted +/// as being at that phi copy insertion point. +/// +/// Normally, we split critical edges, so an incoming block to a phi node has +/// only the one successor, and the use of the incoming value at the phi copy +/// insertion point is a kill use. Often, the phi node and the incoming can be +/// coalesced, unless there is some interference elsewhere due to other values +/// previously coalesced into the two live ranges. +/// +/// However, in one case (a goto/join branching to a join), we cannot split the +/// critical edge. Thus the phi copy insertion point is before the conditional +/// branch in a block with two successors, and the incoming value is likely to +/// be used in the other successor too. Then, there is interference between the +/// phi node and the incoming value, even though they could be coalesced. +/// +/// To avoid this problem, each phicpy segment in a live range is marked as +/// such. A phicpy segment is valid only if there is no segment abutting it +/// before; if there is an abutting before segment, the coalescing code turns it +/// into a normal strong segment and merges the two together. +/// +/// Then, interference between two live ranges LR1 and LR2 is ignored if: +/// +/// 1. the interference arises between a phicpy segment in LR1 and a normal +/// (strong) segment in LR2; and +/// +/// 2. 
the start of the phicpy segment is the phi copy insertion point where the +/// phi node is in LR1 and the incoming value is in LR2. +/// +/// This then allows the incoming value and the phi node to be coalesced, even +/// if the incoming value is also used in the branch's other successor. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXLIVENESS_H +#define GENXLIVENESS_H + +#include "FunctionGroup.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/MapVector.h" +#include +#include +#include +#include + +namespace llvm { + +class BasicBlock; +class BitCastInst; +class CallInst; +class Function; +class FunctionPass; +class GenXBaling; +class GenXLiveness; +class GenXNumbering; +class Instruction; +class PHINode; +class raw_ostream; +class ReturnInst; +class Value; + +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); + +namespace genx { + +class Bale; + +/*********************************************************************** + * IndexFlattener : a class containing some (static) utility functions to + * convert between struct indices (as found in an extractelement instruction) + * and a flattened index, in which a struct containing further structs is + * flattened as if it is a single struct containing just the non-struct + * elements. + * + * SimpleValue uses this to encode and decode its flattened index. + * Liveness and coalescing use flattenArg and getNumArgElements to calculate + * live ranges for function args at the call sites. + */ +struct IndexFlattener { + // flatten : convert struct indices into a flattened index + static unsigned flatten(StructType *ST, ArrayRef Indices); + // getNumElements : get the number of non-struct elements in the flattened + // struct. Returns 1 if it is not a struct type, but 0 for void type. + static unsigned getNumElements(Type *Ty) { + if (auto ST = dyn_cast(Ty)) + return flatten(ST, ST->getNumElements()); + return !Ty->isVoidTy(); + } + // unflatten : convert a flattened index back into normal struct indices + static unsigned unflatten(StructType *ST, unsigned Unflattened, SmallVectorImpl *Indices); + // getElementType : get type of struct element from flattened index + static Type *getElementType(Type *Ty, unsigned FlattenedIndex); + // flattenArg : flatten an arg in a function or call, i.e. calculate the + // total number of flattened indices used up by previous args. If all + // previous args are not struct type, then this just returns the arg + // index + static unsigned flattenArg(FunctionType *FT, unsigned ArgIndex); + // getNumArgElements : get the number of non-struct elements in all args + // of the function + static unsigned getNumArgElements(FunctionType *FT) { + return flattenArg(FT, FT->getNumParams()); + } +}; + +class AssertingSV; + +/*********************************************************************** + * SimpleValue : a non-struct value, possibly inside a struct + * See comment at the top of the file. 
+ */ +class SimpleValue { + Value *V; + unsigned Index; // flattened struct index +public: + SimpleValue() : V(nullptr), Index(0) {} + // Constructor from a non-struct value + SimpleValue(Value *V) : V(V), Index(0) {} + // Constructor from a struct value and an already flattened index + SimpleValue(Value *V, unsigned Index) : V(V), Index(Index) {} + // Constructor from a struct value and unflattened indices (as found in extractelement) + SimpleValue(Value *V, ArrayRef Indices) : V(V), + Index(IndexFlattener::flatten(cast(V->getType()), Indices)) {} + // Accessors + Value *getValue() const { return V; } + unsigned getIndex() const { return Index; } + // getType : get the type of the (element) value + Type *getType(); + // Comparisons + bool operator==(SimpleValue Rhs) const { return V == Rhs.V && Index == Rhs.Index; } + bool operator!=(SimpleValue Rhs) const { return !(*this == Rhs); } + bool operator<(SimpleValue Rhs) const { + if (V != Rhs.V) + return V < Rhs.V; + return Index < Rhs.Index; + } + // Debug dump/print +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + void print(raw_ostream &OS) const; + void printName(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, SimpleValue V) { + V.print(OS); + return OS; +} + +// AssertingSV : like a SimpleValue, but contains an AssertingVH +class AssertingSV { + AssertingVH V; + unsigned Index; +public: + AssertingSV(SimpleValue SV) : V(SV.getValue()), Index(SV.getIndex()) {} + SimpleValue get() const { return SimpleValue(V, Index); } + Value *getValue() const { return V; } + unsigned getIndex() const { return Index; } + Type *getType() const { return get().getType(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const { get().dump(); } +#endif + void print(raw_ostream &OS) const { get().print(OS); } + void printName(raw_ostream &OS) const { get().printName(OS); } +}; + +// Segment : a single range of instruction numbers in which a value is +// live +struct Segment { + enum { WEAK, PHICPY, STRONG }; + unsigned Strength : 2; // whether it is a weak or phicpy or strong segment + +private: + unsigned Start : 30; // inclusive start of range + unsigned End : 30; // exclusive end of range +public: + Segment() : Strength(STRONG), Start(0), End(0) {} + Segment(unsigned S, unsigned E, unsigned Strength = STRONG) + : Strength(Strength) { + assert(E >= S); + Start = S; + End = E; + } + unsigned getStart() const noexcept { return Start; } + void setStart(unsigned S) noexcept { + assert(End >= S); + Start = S; + } + unsigned getEnd() const noexcept{ return End; } + void setEnd(unsigned E) noexcept{ + assert(E >= Start); + End = E; + } + void setStartEnd(unsigned S, unsigned E) noexcept{ + assert(E >= S); + Start = S; + End = E; + } + bool operator<(Segment Rhs) const noexcept{ + if (Start != Rhs.Start) + return Start < Rhs.Start; + return End < Rhs.End; + } + + // use this via std::hash (see end of this file) + size_t hash() const noexcept { + return hash_combine(Start, End, Strength); + } + bool operator==(Segment Rhs) const noexcept{ + return (Start == Rhs.Start) && (End == Rhs.End) && + (Strength == Rhs.Strength); + } + bool isWeak() const noexcept{ return Strength == WEAK; } +}; + +// LiveRange : a collection of Segment structs, in order, describing +// all points in the program in which a value is live. +// Also contains a list of each SimpleValue that points to this LiveRange. +// Also a bitmap of register classes (general, surface, etc) that +// its def and uses need. 
+class LiveRange { + friend class llvm::GenXLiveness; + typedef SmallVector Segments_t; + Segments_t Segments; + typedef SmallVector Values_t; + Values_t Values; +public: + // kernel/stack functions that this LR spans across + std::set Funcs; + unsigned Category :8; + unsigned LogAlignment :7; + bool DisallowCASC: 1; // disallow call arg special coalescing + unsigned Offset :12; // kernel arg offset, else 0 + LiveRange() : Category(0), LogAlignment(0), DisallowCASC(false), Offset(0) {} + // Iterator forwarders for Segments + typedef Segments_t::iterator iterator; + typedef Segments_t::const_iterator const_iterator; + iterator begin() { return Segments.begin(); } + iterator end() { return Segments.end(); } + const_iterator begin() const { return Segments.begin(); } + const_iterator end() const { return Segments.end(); } + unsigned size() { return Segments.size(); } + void resize(unsigned len) { Segments.resize(len); } + // Iterator forwarders for Values. + // This is complicated by the Values vector containing AssertingSV, but the + // iterator wants to dereference to a Simplevalue. + class value_iterator { + Values_t::iterator i; + public: + value_iterator(Values_t::iterator i) : i(i) {} + SimpleValue operator*() { return i->get(); } + AssertingSV *operator->() { return i; } + bool operator==(const value_iterator &Rhs) const { return i == Rhs.i; } + bool operator!=(const value_iterator &Rhs) const { return !(*this == Rhs); } + value_iterator &operator++() { ++i; return *this; } + }; + Values_t& getValues() { return Values; } + value_iterator value_begin() { return Values.begin(); } + value_iterator value_end() { return Values.end(); } + unsigned value_size() { return Values.size(); } + bool value_empty() { return Values.empty(); } + // find : return iterator to segment containing Num (including the case + // of being equal to the segment's End), or, if in a hole, the + // iterator of the next segment, or, if at end, end(). 
+ iterator find(unsigned Num); + void clear() { Segments.clear(); Values.clear(); } + void push_back(Segment Seg) { Segments.push_back(Seg); } + void push_back(unsigned S, unsigned E) { Segments.push_back(Segment(S, E)); } + SimpleValue addValue(SimpleValue V) { Values.push_back(V); return V; } + // contains : test whether live range contains instruction number + bool contains(unsigned Num) { + iterator i = find(Num); + return i != end() && i->getEnd() != Num && i->getStart() <= Num; + } + // getCategory : get the LR's register category + unsigned getCategory() const { return Category; } + // setCategory : set the LR's register category + void setCategory(unsigned Cat) { Category = Cat; } + // getOrDefaultCategory : return category; if none, set default + unsigned getOrDefaultCategory(); + // getLogAlignment : get log alignment + unsigned getLogAlignment() const { return LogAlignment; } + // setAlignmentFromValue : increase alignment if necessary from a value + void setAlignmentFromValue(SimpleValue V); + // setLogAlignment : set log alignment to greater than implied by the LR's values + void setLogAlignment(unsigned Align) { LogAlignment = std::max(LogAlignment, Align); } + // addSegment : add a segment to a live range + void addSegment(Segment Seg); + // setSegmentsFrom : for this live range, clear out its segments + // and copy them from the other live range + void setSegmentsFrom(LiveRange *Other); + // addSegments : add segments from another LR to this one + void addSegments(LiveRange *LR2); + // sortAndMerge : after doing some push_backs, sort the segments + // and merge overlapping/adjacent ones + void sortAndMerge(); + // prepareFuncs : fill the Funcs set with kernel or stack functions which this + // LR is alive in + void prepareFuncs(FunctionGroupAnalysis *FGA); + // getLength : add up the number of instructions covered by this LR + unsigned getLength(bool WithWeak); + // debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + void printSegments(raw_ostream &OS) const; +private: + void value_clear() { Values.clear(); } +#ifndef NDEBUG + // assertOk : assert that live range's segments are well formed + void assertOk(); +#else + void assertOk() {} +#endif +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const LiveRange &LR) { + LR.print(OS); + return OS; +} + +// CallGraph : the call graph within a FunctionGroup +class CallGraph { + FunctionGroup *FG; +public: + class Node; + struct Edge { + unsigned Number; + CallInst *Call; + Node *Callee; + bool operator==(Edge Rhs) const { return Number == Rhs.Number; } + bool operator!=(Edge Rhs) const { return !(*this == Rhs); } + bool operator<(Edge Rhs) const { return Number < Rhs.Number; } + Edge() : Number(0), Call(0) {} + Edge(unsigned Number, CallInst *Call) : Number(Number), Call(Call) {} + }; + class Node { + std::set Edges; + public: + typedef std::set::iterator iterator; + iterator begin() { return Edges.begin(); } + iterator end() { return Edges.end(); } + void insert(Edge E) { Edges.insert(E); } + }; +private: + std::map Nodes; +public: + // constructor from FunctionGroup + CallGraph(FunctionGroup *FG) : FG(FG) {} + // build : build the call graph from the FunctionGroup + void build(GenXLiveness *Liveness); + + // getRoot : get the root node + Node *getRoot() { return &Nodes[FG->getHead()]; } + // getNode : get the node for a Function + Node *getNode(Function *F) { return &Nodes[F]; } +}; + +} // end namespace genx + +class GenXLiveness : public FunctionGroupPass { + FunctionGroup *FG; + using 
LiveRangeMap_t = MapVector; + LiveRangeMap_t LiveRangeMap; + genx::CallGraph *CG; + GenXBaling *Baling; + GenXNumbering *Numbering; + std::map UnifiedRets; + std::map UnifiedRetToFunc; + std::map, Value *> ArgAddressBaseMap; +public: + static char ID; + explicit GenXLiveness() + : FunctionGroupPass(ID), CG(nullptr), Baling(nullptr), + Numbering(nullptr) {} + ~GenXLiveness() { clear(); } + virtual StringRef getPassName() const override { return "GenX liveness analysis"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // setBaling : tell GenXLiveness where GenXBaling is + void setBaling(GenXBaling *B) { Baling = B; } + // Iterator forwarders. + // This gives you an iterator of LiveRangeMap. The ->first field is the + // value, and you only get each value once. The ->second field is the + // LiveRange pointer, and you may get each one multiple times because + // a live range may contain multiple values. + typedef LiveRangeMap_t::iterator iterator; + typedef LiveRangeMap_t::const_iterator const_iterator; + iterator begin() { return LiveRangeMap.begin(); } + iterator end() { return LiveRangeMap.end(); } + const_iterator begin() const { return LiveRangeMap.begin(); } + const_iterator end() const { return LiveRangeMap.end(); } + // getLiveRange : get the live range for a Value of non-struct type + genx::LiveRange *getLiveRange(Value *V) { return getLiveRange(genx::SimpleValue(V)); } + // getLiveRange : get the live range for a genx::SimpleValue + genx::LiveRange *getLiveRange(genx::SimpleValue V); + // getLiveRangeOrNull : get the live range for a Value, or 0 if none + genx::LiveRange *getLiveRangeOrNull(genx::SimpleValue V); + const genx::LiveRange *getLiveRangeOrNull(genx::SimpleValue V) const; + // getOrCreateLiveRange : get the live range for a Value, or create + // a new one if none + genx::LiveRange *getOrCreateLiveRange(genx::SimpleValue V); + genx::LiveRange *getOrCreateLiveRange(genx::SimpleValue V, unsigned Cat, unsigned LogAlign); + // eraseLiveRange : get rid of live range for a Value, possibly multiple + // ones if it is a struct value + void eraseLiveRange(Value *V); + // eraseLiveRange : get rid of live range for a SimpleValue, if any. + // It is assumed that the LiveRange (if any) has no other value atached. 
+ void eraseLiveRange(genx::SimpleValue V); + // eraseLiveRange : get rid of the specified live range, and remove its + // values from the map + void eraseLiveRange(genx::LiveRange *LR); + // twoAddrInterfere : check whether two live ranges interfere, allowing for single number interference sites at two address ops + bool twoAddrInterfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // interfere : test whether two live ranges interfere + bool interfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // getSingleInterferenceSites : check whether two live ranges interfere, returning single number interference sites + bool getSingleInterferenceSites(genx::LiveRange *LR1, genx::LiveRange *LR2, SmallVectorImpl *Sites); + // checkIfOverlappingSegmentsInterfere : given two segments that have been + // shown to overlap, check whether their strengths make them interfere + bool checkIfOverlappingSegmentsInterfere(genx::LiveRange *LR1, genx::Segment *S1, genx::LiveRange *LR2, genx::Segment *S2); + // coalesce : coalesce two live ranges + genx::LiveRange *coalesce(genx::LiveRange *LR1, genx::LiveRange *LR2, bool DisallowCASC); + // Set the GenXNumbering pointer for use by live range building + void setNumbering(GenXNumbering *N) { Numbering = N; } + GenXNumbering *getNumbering() { return Numbering; } + // rebuildCallGraph : rebuild GenXLiveness's call graph + void rebuildCallGraph(); + // buildSubroutineLRs : build an LR for each subroutine. Must be called + // before the first BuildLiveRange + void buildSubroutineLRs(); + // buildLiveRange : build live range for given value if it is simple, + // or one for each flattened index if it is struct type + void buildLiveRange(Value *V); + // buildLiveRange : build live range for given value + genx::LiveRange *buildLiveRange(genx::SimpleValue V); + // rebuildLiveRange : rebuild a live range that only has one value + void rebuildLiveRange(genx::LiveRange *LR); + // removeBale : remove the bale from its live range, and delete the range if + // it now has no values. + void removeBale(genx::Bale &B); + // removeValue : remove the value from its live range, and delete the + // range if it now has no values + void removeValue(Value *V); + void removeValue(genx::SimpleValue V); + // removeValue : remove the value from its live range. Do not delete the + // LR if it now has no values. + genx::LiveRange *removeValueNoDelete(genx::SimpleValue V); + // removeValuesNoDelete : remove all values from the live range, but do not + // delete the LR + void removeValuesNoDelete(genx::LiveRange *LR); + // replaceValue : update liveness such that NewVal has OldVal's live range, + // and OldVal does not have one at all. + void replaceValue(Value *OldVal, Value *NewVal); + void replaceValue(genx::SimpleValue OldVal, genx::SimpleValue(NewVal)); + // Set the LiveRange for a value in the map + void setLiveRange(genx::SimpleValue V, genx::LiveRange *LR); + // Get/create the unified return value for a function + Value *getUnifiedRet(Function *F); + Value *createUnifiedRet(Function *F); + // Test whether a value is a unified return value (and return its Function). + Function *isUnifiedRet(Value *V); + // Move unified return value from OldF to NewF. 
+ void moveUnifiedRet(Function *OldF, Function *NewF); + // copyInterfere : test whether two live ranges copy-interfere + bool copyInterfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // See if V1 is a phi node and V2 wraps round to a phi use in the same BB after V1's def + static bool wrapsAround(Value *V1, Value *V2); + // Insert a copy of a non-struct value. + Instruction *insertCopy(Value *InputVal, genx::LiveRange *LR, Instruction *InsertBefore, const Twine &Name, unsigned Number); + // eraseUnusedTree : erase unused tree of instructions, and remove from GenXLiveness + void eraseUnusedTree(Instruction *Inst); + // setArgAddressBase : set the base value of an argument indirect address + void setArgAddressBase(Value *Addr, Value *Base) { ArgAddressBaseMap[Addr] = Base; } + // getAddressBase : get the base register of an address + Value *getAddressBase(Value *Addr); + // isBitCastCoalesced : see if the bitcast has been coalesced away + bool isBitCastCoalesced(BitCastInst *BCI); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const override + { return createGenXGroupPrinterPass(O, Banner); } + // Debug dump + void dump(); + using Pass::print; // Indicates we aren't replacing base class version of print + virtual void print(raw_ostream &OS) const; + virtual void releaseMemory() override { clear(); } + +private: + void clear(); + unsigned numberInstructionsInFunc(Function *Func, unsigned Num); + unsigned getPhiOffset(PHINode *Phi) const; + void rebuildLiveRangeForValue(genx::LiveRange *LR, genx::SimpleValue SV); + genx::LiveRange *visitPropagateSLRs(Function *F); + void merge(genx::LiveRange *LR1, genx::LiveRange *LR2); +}; + +void initializeGenXLivenessPass(PassRegistry &); + +// Specialize DenseMapInfo for SimpleValue. 
+template <> struct DenseMapInfo { + static inline genx::SimpleValue getEmptyKey() { + return genx::SimpleValue(DenseMapInfo::getEmptyKey()); + } + static inline genx::SimpleValue getTombstoneKey() { + return genx::SimpleValue(DenseMapInfo::getTombstoneKey()); + } + static unsigned getHashValue(const genx::SimpleValue &SV) { + return DenseMapInfo::getHashValue(SV.getValue()) ^ + DenseMapInfo::getHashValue(SV.getIndex()); + } + static bool isEqual(const genx::SimpleValue &LHS, + const genx::SimpleValue &RHS) { + return LHS == RHS; + } +}; + +} // end namespace llvm +namespace std { +template <> struct hash { + size_t operator()(llvm::genx::Segment const &x) const noexcept { + return x.hash(); + } +}; +} // end namespace std +#endif // GENXLIVENESS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp new file mode 100644 index 000000000000..a66bb7785bba --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp @@ -0,0 +1,200 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// \file +// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when +// the size is large or is not a compile-time constant. 
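+//
+// A minimal before/after sketch (illustrative only; the generated IR depends
+// on the expand limit and on pointer address spaces):
+//
+//   before: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 32, i1 false)
+//   after:  %v = load <32 x i8>, <32 x i8>* %src.cast
+//           store <32 x i8> %v, <32 x i8>* %dst.cast
+//
+// Lengths above the expand limit, or lengths that are not compile-time
+// constants, are expanded into an explicit copy loop instead.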
+// +//===----------------------------------------------------------------------===// + +#include "GenXLowerAggrCopies.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +#define DEBUG_TYPE "GENX_LOWERAGGRCOPIES" + +using namespace llvm; + +// 8 * 8 * 16 = 8 instructions each read 8 OWords +static cl::opt + ExpandLimitOpt("lower-aggr-copies-expand-limit", + cl::desc("max memcpy/memset/memmove length (in bytes) that " + "is lowered as scalar code"), + cl::init(8 * 8 * 16)); + +namespace { + +// actual analysis class, which is a functionpass +struct GenXLowerAggrCopies : public FunctionPass { + // TODO: more advance analysis + // (at least different values for different arch) + const int ExpandLimit; + static char ID; + + GenXLowerAggrCopies() : FunctionPass(ID), ExpandLimit(ExpandLimitOpt) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F) override; + + static const unsigned MaxAggrCopySize = 1; // 128; + + StringRef getPassName() const override { + return "Lower aggregate copies/intrinsics into loops"; + } + + template void expandMemMov2VecLoadStore(T *MemCall); +}; + +char GenXLowerAggrCopies::ID = 0; + +bool GenXLowerAggrCopies::runOnFunction(Function &F) { + SmallVector MemCalls; + + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + + // Collect all aggregate loads and mem* calls. + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; + ++II) { + if (MemIntrinsic *IntrCall = dyn_cast(II)) { + // Convert intrinsic calls with variable size or with constant size + // larger than the MaxAggrCopySize threshold. + if (ConstantInt *LenCI = dyn_cast(IntrCall->getLength())) { + if (LenCI->getZExtValue() >= MaxAggrCopySize) { + MemCalls.push_back(IntrCall); + } + } else { + MemCalls.push_back(IntrCall); + } + } + } + } + + if (MemCalls.size() == 0) { + return false; + } + + // Transform mem* intrinsic calls. 
+ for (MemIntrinsic *MemCall : MemCalls) { + bool doLinearExpand = !MemCall->isVolatile() && isa(MemCall->getLength()) && + cast(MemCall->getLength())->getSExtValue() <= ExpandLimit; + if (MemCpyInst *Memcpy = dyn_cast(MemCall)) { + if (doLinearExpand) { + expandMemMov2VecLoadStore(Memcpy); + } else { + expandMemCpyAsLoop(Memcpy, TTI); + } + } else if (MemMoveInst *Memmove = dyn_cast(MemCall)) { + if (doLinearExpand) { + expandMemMov2VecLoadStore(Memmove); + } else { + expandMemMoveAsLoop(Memmove); + } + } else if (MemSetInst *Memset = dyn_cast(MemCall)) { + if (doLinearExpand) { + llvm::Value *SetVal = Memset->getValue(); + llvm::Value *LenVal = Memset->getLength(); + assert(isa(LenVal)); + assert(SetVal->getType()->getScalarSizeInBits() == 8); + auto Len = (unsigned)cast(LenVal)->getZExtValue(); + auto VecTy = VectorType::get(SetVal->getType(), Len); + Value *WriteOut = UndefValue::get(VecTy); + IRBuilder<> IRB(Memset); + for (unsigned i = 0; i < Len; ++i) { + WriteOut = IRB.CreateInsertElement(WriteOut, SetVal, IRB.getInt32(i)); + } + auto DstAddr = Memset->getRawDest(); + unsigned dstAS = cast(DstAddr->getType())->getAddressSpace(); + auto StorePtrV = + IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(dstAS)); + IRB.CreateStore(WriteOut, StorePtrV); + } else { + expandMemSetAsLoop(Memset); + } + } + MemCall->eraseFromParent(); + } + + return true; +} + +template +void GenXLowerAggrCopies::expandMemMov2VecLoadStore(T *MemCall) { + IRBuilder<> IRB(MemCall); + llvm::Value *LenVal = MemCall->getLength(); + assert(isa(LenVal)); + auto Len = (unsigned)cast(LenVal)->getZExtValue(); + auto DstPtrV = MemCall->getRawDest(); + assert(DstPtrV->getType()->isPointerTy()); + auto I8Ty = cast(DstPtrV->getType())->getElementType(); + assert(I8Ty->isIntegerTy(8)); + auto VecTy = VectorType::get(I8Ty, Len); + auto SrcAddr = MemCall->getRawSource(); + unsigned srcAS = cast(SrcAddr->getType())->getAddressSpace(); + auto LoadPtrV = IRB.CreateBitCast(SrcAddr, VecTy->getPointerTo(srcAS)); + auto ReadIn = IRB.CreateLoad(LoadPtrV); + auto DstAddr = MemCall->getRawDest(); + unsigned dstAS = cast(DstAddr->getType())->getAddressSpace(); + auto StorePtrV = IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(dstAS)); + IRB.CreateStore(ReadIn, StorePtrV); +} + +} // namespace + +namespace llvm { +void initializeGenXLowerAggrCopiesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXLowerAggrCopies, "genx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(GenXLowerAggrCopies, "genx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) + +FunctionPass *llvm::createGenXLowerAggrCopiesPass() { + return new GenXLowerAggrCopies(); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h new file mode 100644 index 000000000000..540aaf32614f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h @@ -0,0 +1,41 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the 
Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains the declaration of the VC specific lowering of +// aggregate copies +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_GENX_GENXLOWERAGGRCOPIES_H +#define LLVM_LIB_TARGET_GENX_GENXLOWERAGGRCOPIES_H + +namespace llvm { +class FunctionPass; + +FunctionPass *createGenXLowerAggrCopiesPass(); +} + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp new file mode 100644 index 000000000000..26e1b4bd8233 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp @@ -0,0 +1,3071 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLowering +/// ------------ +/// +/// GenXLowering is a function pass that lowers certain LLVM IR instructions +/// that the rest of the GenX backend cannot deal with, or to implement peephole +/// optimizations. +/// +/// It also performs a few other tasks: +/// +/// 1. It implements add sinking for a variable index in a region/element +/// access. This ensures that, in a sequence of operations to calculate a +/// variable index for a region/element access, any add constant is sunk to +/// the end, such that it can become a constant offset in an indirect +/// operand, and give GenXAddressCommoning more chance to common up address +/// calculations. +/// +/// 2. 
It splits struct values where possible, by splitting all struct phi nodes +/// before running the main pass, then removing an extractvalue by using the +/// corresponding insertvalue's input instead. Any struct value used as an +/// arg or return value still remains, and needs to be dealt with by register +/// allocation. +/// +/// 3. It widens some byte vector operations to short vector. +/// +/// Gen has restrictions on byte operands. The jitter copes with that, but +/// sometimes it needs to do even-odd splitting, which can lead to suboptimal +/// code if cmps and predicates are involved. +/// Here we attempt to pick up the common cases by converting a byte +/// operation to short. +/// +/// Note that we might end up with the extends being baled into the +/// instruction anyway, resulting in a byte operation in vISA. +/// +/// 4. Certain uses of shufflevector are lowered: +/// +/// a. a splat (copy of one element across a vector); +/// b. a boolean slice (extract of a subvector) becomes rdpredregion; +/// c. a boolean unslice (insert subvector) becomes wrpredregion. +/// d. non-boolean shufflevector is lowered to sequence of rd/wrregions +/// +/// The only one case of shufflevector allowed is shufflevector of +/// predicate and undef with replicated mask. +/// +/// 5. A Trunc is lowered to a bitcast then a region/element read with a stride. +/// GenXCoalescing will coalesce the bitcast, and possibly bale in the region +/// read, so this will hopefully save an instruction or two. +/// +/// 6. Certain floating point comparison instructions are lowered. +/// +/// **IR restriction**: LLVM IR instructions not supported after this pass: +/// +/// * insertelement +/// * extractelement +/// * trunc +/// * zext/sext/uitofp from (vector of) i1 +/// * select on vector of i1 +/// * ``llvm.uadd.with.overflow`` (the other +/// overflowing arithmetic intrinsics are not allowed by the GenX backend +/// anyway.) +/// +/// +/// **IR restriction**: all gather/scatter/atomic must have the width supported +/// by the hardware target. +/// +/// **IR restriction**: rdpredregion intrinsic (which is generated by this pass +/// from certain cases of shufflevector, and represents a use of part of a +/// predicate) can only be used in select, wrregion, wrpredpredregion. +/// +/// **IR restriction**: wrpredregion intrinsic (which is generated by this pass +/// from certain cases of shufflevector, and represents the write of part of a +/// predicate) must have a compare as its "new value" input. +/// +/// **IR restriction**: No phi node of struct type after this pass. This is only +/// a general rule; subsequent passes have been known to reintroduce them so +/// GenXLiveness has another go at splitting them up. 
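+///
+/// A minimal before/after sketch of add sinking (case 1 above; names are
+/// illustrative, not taken from a test):
+///
+///   before: %idx = add i16 %base, 12
+///           %v   = rdregion(..., indirect index %idx, ...)
+///   after:  %v   = rdregion(..., indirect index %base, constant offset +12, ...)
+///
+/// so the constant part of the index becomes the constant offset of the
+/// indirect operand, and %base is easier to common up across accesses.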
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "visa_igc_common_header.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include +#include +#include + +using namespace llvm; +using namespace genx; + +static cl::opt + EnableGenXByteWidening("enable-genx-byte-widening", cl::init(true), + cl::Hidden, cl::desc("Enable GenX byte widening.")); + +namespace { + +// GenXLowering : legalize execution widths and GRF crossing +class GenXLowering : public FunctionPass { + DominatorTree *DT = nullptr; + const GenXSubtarget *ST = nullptr; + SmallVector ToErase; + +public: + static char ID; + explicit GenXLowering() : FunctionPass(ID), DT(nullptr) {} + virtual StringRef getPassName() const { return "GenX lowering"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); + static bool splitStructPhi(PHINode *Phi); + +private: + bool splitGatherScatter(CallInst *CI, unsigned IID); + bool processTwoAddressOpnd(CallInst *CI); + bool processInst(Instruction *Inst); + bool lowerRdRegion(Instruction *Inst); + bool lowerWrRegion(Instruction *Inst); + bool lowerRdPredRegion(Instruction *Inst); + bool lowerWrPredRegion(Instruction *Inst); + bool lowerInsertElement(Instruction *Inst); + bool lowerExtractElement(Instruction *Inst); + Value *scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy, + Instruction *InsertBefore); + bool lowerTrunc(Instruction *Inst); + bool lowerCast(Instruction *Inst); + bool lowerBoolScalarSelect(SelectInst *SI); + bool lowerBoolVectorSelect(SelectInst *SI); + bool lowerBoolShuffle(ShuffleVectorInst *Inst); + bool lowerBoolSplat(ShuffleVectorInst *SI, Value *In, unsigned Idx); + bool lowerSelect(SelectInst* SI); + bool lowerShuffle(ShuffleVectorInst *Inst); + void lowerShuffleSplat(ShuffleVectorInst *SI, + ShuffleVectorAnalyzer::SplatInfo Splat); + bool lowerShuffleToSelect(ShuffleVectorInst *Inst); + void lowerShuffleToMove(ShuffleVectorInst *SI); + bool lowerShr(Instruction *Inst); + bool lowerExtractValue(ExtractValueInst *Inst); + bool lowerInsertValue(InsertValueInst *Inst); + bool lowerUAddWithOverflow(CallInst *CI); + bool lowerCtpop(CallInst *CI); + bool lowerFCmpInst(FCmpInst *Inst); + bool widenByteOp(Instruction *Inst); + bool lowerLoadStore(Instruction *Inst); + bool lowerMul64(Instruction *Inst); + bool lowerTrap(CallInst *CI); +}; + +} // end namespace + +char GenXLowering::ID = 0; +namespace llvm { +void initializeGenXLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLowering, "GenXLowering", "GenXLowering", false, + false) +INITIALIZE_PASS_END(GenXLowering, "GenXLowering", "GenXLowering", false, false) + +FunctionPass *llvm::createGenXLoweringPass() { + initializeGenXLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXLowering; +} + +void 
GenXLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); +} + +/*********************************************************************** + * GenXLowering::runOnFunction : process one function to + * lower instructions as required for GenX backend. + * + * This does a postordered depth first traversal of the CFG, + * processing instructions within a basic block in reverse, to + * ensure that we see a def after its uses (ignoring phi node uses). + * This helps peephole optimizations which generally want to be + * approached from the top down. For example, add sinking in the index + * of an indirect region/element wants to see the trunc before the trunc + * is lowered to a bitcast and an element access. + */ +bool GenXLowering::runOnFunction(Function &F) { + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + // First split any phi nodes with struct type. + splitStructPhis(&F); + // Create a list of basic blocks in the order we want to process them, before + // we start the lowering. This is because lowering can split a basic block. + SmallVector BBs; + for (auto i = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); + i != e; ++i) + BBs.push_back(*i); + // Process each basic block. + for (auto i = BBs.begin(), e = BBs.end(); i != e; ++i) { + BasicBlock *BB = *i; + // The effect of this loop is that we process the instructions in reverse + // order, and we re-process anything inserted before the instruction + // being processed. + for (Instruction *Inst = BB->getTerminator();;) { + processInst(Inst); + BasicBlock *Parent = Inst->getParent(); + if (Inst != &Parent->front()) + Inst = Inst->getPrevNode(); + else { + if (Parent == BB) + break; + // We have reached the start of the basic block, but it is a different + // basic block to BB, so lowering must have split a BB. Just go back to + // the end of the previous one. + Inst = Parent->getPrevNode()->getTerminator(); + } + } + } + // Erase the instructions that we saved in ToErase. + for (SmallVectorImpl::iterator i = ToErase.begin(), + e = ToErase.end(); + i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + return true; +} + +// Optimize two address operands if any. +// +// An instruction with a two address opernd should be predicated. If predicate +// is a constant splat, then the old value will be over-written. In this case, +// replace the old value with undef which allows more optimizations to kick in. +// +bool GenXLowering::processTwoAddressOpnd(CallInst *CI) { + int OpNum = getTwoAddressOperandNum(CI); + // Skip write regions whose OpNum is 0. + if (OpNum > 0) { + Type *Ty = CI->getArgOperand(OpNum)->getType(); + assert(Ty == CI->getType() && "two address op type out of sync"); + + for (unsigned i = 0; i < CI->getNumArgOperands(); ++i) { + auto Op = dyn_cast(CI->getArgOperand(i)); + // Check if the predicate operand is all true. + if (Op && Op->getType()->getScalarSizeInBits() == 1) { + if (Op->getType()->isVectorTy()) + Op = Op->getSplatValue(); + if (Op && Op->isOneValue()) { + CI->setOperand(OpNum, UndefValue::get(Ty)); + return true; + } + return false; + } + } + } + + return false; +} + +// Check whether given intrinsic is new load +// without predicate and old value arguments. 
+static bool isNewLoadInst(CallInst *Inst) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::genx_gather4_scaled2: + case GenXIntrinsic::genx_gather_scaled2: + return true; + default: + return false; + } +} + +// Find single wrregion user of load instruction. +// Returns nullptr on failure. +static CallInst *getLoadWrregion(CallInst *Inst) { + assert(isNewLoadInst(Inst) && "Expected new load intrinsics"); + if (Inst->getNumUses() != 1) + return nullptr; + + auto *WrR = dyn_cast(Inst->user_back()); + if (!WrR) + return nullptr; + return GenXIntrinsic::isWrRegion(WrR) ? WrR : nullptr; +} + +// Find single select user of load instruction. +// Returns nullptr on failure. +// TODO: maybe just lower every select to wrregion in lowerSelect? +static SelectInst *getLoadSelect(CallInst *Inst) { + assert(isNewLoadInst(Inst) && "Expected new load intrinsics"); + if (Inst->getNumUses() != 1) + return nullptr; + + auto *SI = dyn_cast(Inst->user_back()); + if (!SI) + return nullptr; + // TODO: handle inverted selects. + // Need to regenerate mask in this case. + if (SI->getTrueValue() != Inst) + return nullptr; + return SI; +} + +// Generate predicate for wrregion of splitted load. +// Returns new predicate. +static Value *generatePredicateForLoadWrregion( + Value *OldPred, unsigned Offset, unsigned Width, unsigned NumChannels, + Instruction *InsertBefore, const DebugLoc &DL, const Twine &Name) { + if (isa(OldPred)) + return OldPred; + + Value *Pred = OldPred; + // If old predicate is result of rdpredregion or shufflevector then + // we can reuse their predicate and offset to avoid double read of predicate. + if (GenXIntrinsic::getGenXIntrinsicID(OldPred) == GenXIntrinsic::genx_rdpredregion) { + auto *OldPredInst = cast(OldPred); + Offset += cast(OldPredInst->getArgOperand(1))->getZExtValue(); + Pred = OldPredInst->getArgOperand(0); + } else if (auto *SVI = dyn_cast(OldPred)) { + Offset += + ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SVI).InitialOffset; + Pred = SVI->getOperand(0); + } + + // Replicate mask across channels. + SmallVector NewMaskVals(Width); + unsigned ChannelWidth = Width / NumChannels; + Type *Int32Ty = IntegerType::getInt32Ty(Pred->getContext()); + for (unsigned i = 0; i < NumChannels; ++i) + std::generate_n(NewMaskVals.begin() + ChannelWidth * i, ChannelWidth, + [Int32Ty, Offset]() mutable { + return ConstantInt::get(Int32Ty, Offset++); + }); + Constant *NewMask = ConstantVector::get(NewMaskVals); + + Value *Undef = UndefValue::get(Pred->getType()); + auto *Res = new ShuffleVectorInst(Pred, Undef, NewMask, Name, InsertBefore); + Res->setDebugLoc(DL); + return Res; +} + +// Generate partial write for result of splitted 1-channel load instruction. +// Initially we could have following sequence for illegal load (on gather_scaled example): +// res = gather_scaled <32> +// mask = rdpredregion <32> pred, offset +// newV = wrregion <32> oldV, res, wroffset, mask +// After splitting we want to get as less extra code as possible. +// To achieve this we generate following pattern: +// bale { +// res1 = gather_scaled <16> +// mask1 = rdpredregion <16> pred, offset +// partialV = wrregion <16> oldV, res1, mask1 +// } +// bale { +// res2 = gather_scaled <16> +// mask2 = rdpredregion <16> pred, offset + 16 +// newV = wrregion <16> partialV, res2, wroffset + 16 * elemsize, mask2 +// } +// Bale markers show how this will be baled later. 
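+// How the replicated predicate mask is built (a plain-C++ model of the
+// std::generate_n loop in generatePredicateForLoadWrregion above; names are
+// illustrative):
+//
+//   unsigned ChannelWidth = Width / NumChannels;
+//   for (unsigned C = 0; C < NumChannels; ++C)
+//     for (unsigned I = 0; I < ChannelWidth; ++I)
+//       Mask[C * ChannelWidth + I] = Offset + I; // same slice per channel
+//
+// i.e. every enabled channel selects the same ChannelWidth-wide slice of the
+// original predicate, starting at element Offset.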
+static Value *generate1ChannelWrrregion(Value *Target, unsigned InitialOffset, + CallInst *Load, Value *OldPred, + unsigned SplitNum, + Instruction *InsertBefore) { + const DebugLoc &DL = Load->getDebugLoc(); + Type *LoadType = Load->getType(); + unsigned LoadWidth = LoadType->getVectorNumElements(); + + Value *Pred = + generatePredicateForLoadWrregion(OldPred, LoadWidth * SplitNum, LoadWidth, + 1, InsertBefore, DL, "load1.pred.split"); + Region WrR(LoadType); + WrR.Mask = Pred; + WrR.Offset = InitialOffset + + LoadWidth * SplitNum * (LoadType->getScalarSizeInBits() / 8); + return WrR.createWrRegion(Target, Load, "load1.join", InsertBefore, DL); +} + +// Generate partial write for result of splitted N-channel load. +// For channelled loads we need to also shuffle result of splitted +// instructions to write back them to destination in expected order. +// Temporary splits should always be predicated in case of atomics +// because latter load and store at the same time. +// Example for gather4_scaled (with two channels enabled). Before: +// res = gather4_scaled <32> RG +// mask = rdpredregion <64> pred, offset ; mask is replicated across channels +// newV = wrregion <64> oldV, res, wroffset, mask +// After: +// bale { +// res1temp = gather4_scaled <16> RG ; create temporary (unnecessary in case of non-atomics) +// splitmask1 = rdpredregion <32> pred, offset ; replicated +// res1 = wrregion <32> undef, res1temp, 0, splitmask1 +// } +// bale { +// res1R = rdregion <16> res1, 0 +// mask1R = rdpredregion <16> pred, offset ; same for all channels +// partialVR = wrregion <16> oldV, res1R, wroffset, mask1R +// } +// bale { +// res1G = rdregion <16> res1, 16 * elemsize +// mask1G = rdpredregion <16> pred, offset +// partialV = wrregion <16> partialVR, res1G, wroffset + 32 * elemsize, mask1G +// } +// bale { +// res2temp = gather4_scaled <16> RG ; second temporary +// splitmask2 = rdpredregion <32> pred, offset + 16 +// res2 = wrregion <32> undef, res2temp, 0, splitmask2 +// } +// bale { +// res2R = rdregion <16> res2, 0 +// mask2R = rdpredregion <16> pred, offset + 16 +// newVR = wrregion <16> partialV, res2R, wroffset + 16 * elemsize, mask2R +// } +// bale { +// res2G = rdregion <16> res2, 16 * elemsize +// mask2G = rdpredregion <16> pred, offset + 16 +// newV = wrregion <16> newVR, res2G, wroffset + 48 * elemsize, mask2G +// } +// As it can be noticed, splitting of channeled loads is quite expensive. +// We should hope that later passes (like region collapsing) can optimize it +// by analyzing how resulting value was assembled. +static Value *generateNChannelWrregion(Value *Target, unsigned InitialOffset, + CallInst *Load, Value *OldPred, + unsigned SplitNum, unsigned NumSplits, + unsigned NumChannels, + Instruction *InsertBefore) { + const DebugLoc &DL = Load->getDebugLoc(); + Type *LoadType = Load->getType(); + unsigned LoadWidth = LoadType->getVectorNumElements(); + unsigned ChannelWidth = LoadWidth / NumChannels; + unsigned MaskOffset = ChannelWidth * SplitNum; + + // Generate temporary for load. + Value *Pred = generatePredicateForLoadWrregion( + OldPred, MaskOffset, LoadWidth, NumChannels, InsertBefore, DL, "loadN.pred.split"); + Region WrR(LoadType); + WrR.Mask = Pred; + Value *SplitRes = WrR.createWrRegion(UndefValue::get(LoadType), Load, + "loadN.split", InsertBefore, DL); + + // Generate shuffle writes to the target. 
+ unsigned ElemByteSize = LoadType->getScalarSizeInBits() / 8; + Type *ShuffleType = VectorType::get(LoadType->getScalarType(), ChannelWidth); + Region ChannelRdR(ShuffleType); + Region ChannelWrR(ShuffleType); + Value *ResChannel = nullptr; + for (unsigned i = 0; i < NumChannels; ++i) { + ChannelRdR.Offset = ChannelWidth * i * ElemByteSize; + ResChannel = ChannelRdR.createRdRegion(SplitRes, "loadN.channel.read.join", + InsertBefore, DL); + Pred = generatePredicateForLoadWrregion(OldPred, MaskOffset, ChannelWidth, + 1, InsertBefore, DL, + "loadN.channel.pred.join"); + ChannelWrR.Offset = + InitialOffset + + (ChannelWidth * SplitNum + ChannelWidth * NumSplits * i) * ElemByteSize; + ChannelWrR.Mask = Pred; + Target = ChannelWrR.createWrRegion(Target, ResChannel, "loadN.channel.join", + InsertBefore, DL); + } + return Target; +} + +// Get target for wrregions of splitted load. +// Returns tuple consisted of: +// 1. Target for wrregions +// 2. Predicate +// 3. Initial offset of target +// 4. Instruction to replace later +static std::tuple +getLoadTarget(CallInst *Load, const GenXSubtarget *ST) { + Value *LoadPred; + if (CallInst *LoadWrr = getLoadWrregion(Load)) { + // If we found wrregion user, then use its predicate for splitted instructions. + LoadPred = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + + // If wrregion can be represented as raw operand, we can reuse its target and offset. + if (genx::isValueRegionOKForRaw(LoadWrr, true /* IsWrite */, ST)) { + // TODO: mark wrregion to be erased once issue with ToErase and + // iteration order will be resolved. + Value *Target = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + Value *Offset = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + unsigned InitialOffset = cast(Offset)->getZExtValue(); + return {Target, LoadPred, InitialOffset, LoadWrr}; + } + } else if (SelectInst *SI = getLoadSelect(Load)) { + LoadPred = SI->getCondition(); + Value *Target = SI->getFalseValue(); + return {Target, LoadPred, 0, SI}; + } else { + // No wrregion user, load is not predicated. + LoadPred = ConstantInt::get(IntegerType::getInt1Ty(Load->getContext()), 1); + } + + // Create new target for load. + Value *Target = UndefValue::get(Load->getType()); + return {Target, LoadPred, 0, Load}; +} + +/*********************************************************************** + * splitGatherScatter : lower gather/scatter/atomic to the width support + * by the hardware platform. + * + * This performs two functions: + * + * 1. If the operation is wider than what hardware can support, splits it + * into the legal width. + * + * 2. For typed gather4/scatter4, when r or both v and r are zero, replace + * with undef so that they are not encoded in the vISA instruction and the + * message skips them. 
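+ *
+ * A concrete sketch of case 1 (widths chosen for illustration): a 32-wide
+ * genx.svm.gather on a target that executes at most 16 lanes per message is
+ * rewritten as two 16-wide gathers; each split reads a 16-wide slice of the
+ * predicate and address operands, and its result is written back into the
+ * 32-wide destination with a wrregion at the matching element offset.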
+ */ +bool GenXLowering::splitGatherScatter(CallInst *CI, unsigned IID) { + enum { + MASK_IDX = 0, + PRED_IDX = 1, + SURF_IDX = 2, + U_IDX = 3, + DATA_IDX = 6, + NONEED = 11 + }; + + unsigned MaskIdx = NONEED; + unsigned PredIdx = NONEED; + unsigned AddrIdx = NONEED; + unsigned DataIdx = NONEED; + unsigned AtomicSrcIdx = NONEED; + bool IsTyped = false; + int AtomicNumSrc = (-1); // -1 means not-an-atomic + + switch (IID) { + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_fmax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_xor: + AtomicSrcIdx = 2; + PredIdx = 0; + AddrIdx = 3; + IsTyped = true; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_typed_atomic_dec: + case GenXIntrinsic::genx_typed_atomic_inc: + PredIdx = 0; + AddrIdx = 2; + IsTyped = true; + AtomicNumSrc = 0; + break; + case GenXIntrinsic::genx_typed_atomic_cmpxchg: + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + AtomicSrcIdx = 2; + PredIdx = 0; + AddrIdx = 4; + IsTyped = true; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_scatter4_typed: + case GenXIntrinsic::genx_gather4_typed: + DataIdx = DATA_IDX; + MaskIdx = MASK_IDX; + PredIdx = PRED_IDX; + AddrIdx = U_IDX; + IsTyped = true; + break; + case GenXIntrinsic::genx_scatter4_scaled: + case GenXIntrinsic::genx_gather4_scaled: + DataIdx = 6; + PredIdx = 0; + MaskIdx = 1; + AddrIdx = 5; + break; + case GenXIntrinsic::genx_gather4_scaled2: + MaskIdx = 0; + AddrIdx = 4; + break; + case GenXIntrinsic::genx_svm_scatter4_scaled: + case GenXIntrinsic::genx_svm_gather4_scaled: + DataIdx = 5; + PredIdx = 0; + MaskIdx = 1; + AddrIdx = 4; + break; + case GenXIntrinsic::genx_scatter_scaled: + case GenXIntrinsic::genx_gather_scaled: + DataIdx = 6; + PredIdx = 0; + AddrIdx = 5; + break; + case GenXIntrinsic::genx_gather_scaled2: + AddrIdx = 4; + break; + case GenXIntrinsic::genx_svm_scatter: + case GenXIntrinsic::genx_svm_gather: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 2; + break; + case GenXIntrinsic::genx_svm_atomic_dec: + case GenXIntrinsic::genx_svm_atomic_inc: + DataIdx = 2; + PredIdx = 0; + AddrIdx = 1; + AtomicNumSrc = 0; + case GenXIntrinsic::genx_svm_atomic_add: + case GenXIntrinsic::genx_svm_atomic_and: + case GenXIntrinsic::genx_svm_atomic_fmax: + case GenXIntrinsic::genx_svm_atomic_fmin: + case GenXIntrinsic::genx_svm_atomic_imax: + case GenXIntrinsic::genx_svm_atomic_imin: + case GenXIntrinsic::genx_svm_atomic_max: + case GenXIntrinsic::genx_svm_atomic_min: + case GenXIntrinsic::genx_svm_atomic_or: + case GenXIntrinsic::genx_svm_atomic_sub: + case GenXIntrinsic::genx_svm_atomic_xchg: + case GenXIntrinsic::genx_svm_atomic_xor: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 1; + AtomicSrcIdx = 2; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_svm_atomic_cmpxchg: + case GenXIntrinsic::genx_svm_atomic_fcmpwr: + DataIdx = 4; + PredIdx = 0; + AddrIdx = 1; + AtomicSrcIdx = 2; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_imax: + case 
GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_xor: + DataIdx = 4; + PredIdx = 0; + AddrIdx = 2; + AtomicSrcIdx = 3; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + DataIdx = 5; + PredIdx = 0; + AddrIdx = 2; + AtomicSrcIdx = 3; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_inc: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 2; + AtomicNumSrc = 0; + break; + + default: + return false; + } + + // nulling unused inputs for typed gather/scatter/atomic + if (IsTyped) { + Constant *V = dyn_cast(CI->getArgOperand(AddrIdx + 1)); + Constant *R = dyn_cast(CI->getArgOperand(AddrIdx + 2)); + // Only continue when R is known to be zero. + if (R && R->isNullValue()) { + CI->setOperand(AddrIdx + 2, UndefValue::get(R->getType())); + if (V && V->isNullValue()) + CI->setOperand(AddrIdx + 1, UndefValue::get(V->getType())); + } + // check if LOD is zero for atomic + if (AtomicNumSrc >= 0) { + Constant *LOD = dyn_cast(CI->getArgOperand(AddrIdx + 3)); + if (LOD && LOD->isNullValue()) + CI->setOperand(AddrIdx + 3, UndefValue::get(LOD->getType())); + } + } + // Deduce intrinsic width: check predicate if exists, then check address vector. + unsigned WidthOperand; + if (PredIdx != NONEED) + WidthOperand = PredIdx; + else if (AddrIdx != NONEED) + WidthOperand = AddrIdx; + else + llvm_unreachable("Cannot infer execution width of intrinsic (checked pred and addr operands)"); + auto Width = CI->getArgOperand(WidthOperand)->getType()->getVectorNumElements(); + unsigned TargetWidth = IsTyped ? 
8 : 16; + if (Width <= TargetWidth) + return false; + assert((Width % TargetWidth) == 0); + auto NumSplits = Width / TargetWidth; + assert(NumSplits == 2 || NumSplits == 4); + unsigned NumChannels = 1; + if (MaskIdx != NONEED) { + NumChannels = (unsigned)cast(CI->getArgOperand(MaskIdx)) + ->getZExtValue(); + NumChannels = (NumChannels & 1) + ((NumChannels & 2) >> 1) + + ((NumChannels & 4) >> 2) + ((NumChannels & 8) >> 3); + } + + unsigned NumBlks = 1; + if (IID == GenXIntrinsic::genx_svm_scatter || + IID == GenXIntrinsic::genx_svm_gather) { + NumBlks = (unsigned)cast(CI->getArgOperand(1))->getZExtValue(); + NumBlks = (1 << NumBlks); + auto ElmSz = CI->getArgOperand(DataIdx)->getType()->getScalarSizeInBits() / 8; + if (ElmSz == 1 && NumBlks < 4) + NumBlks = 4; + else if (ElmSz == 2 && NumBlks < 2) + NumBlks = 2; + } + const DebugLoc &DL = CI->getDebugLoc(); + Value *NewResult = nullptr; + if (CI->getType() && + CI->getType()->isVectorTy() && + CI->getType()->getVectorNumElements() >= Width * NumChannels * NumBlks) { + if (DataIdx != NONEED) + NewResult = CI->getArgOperand(DataIdx); + else + NewResult = UndefValue::get(CI->getType()); + } + + bool IsNewLoad = isNewLoadInst(CI); + Value *LoadPred = nullptr; + unsigned InitialOffset = 0; + Instruction *InstToReplace = CI; + if (IsNewLoad) + std::tie(NewResult, LoadPred, InitialOffset, InstToReplace) = + getLoadTarget(CI, ST); + + for (unsigned i = 0; i < NumSplits; ++i) { + SmallVector Args; + // initialize the args with the old values + for (unsigned ArgI = 0; ArgI < CI->getNumArgOperands(); ++ArgI) + Args.push_back(CI->getArgOperand(ArgI)); + // Predicate + if (PredIdx != NONEED) { + Value *V = CI->getArgOperand(PredIdx); + if (auto C = dyn_cast(V)) + Args[PredIdx] = getConstantSubvector(C, i * TargetWidth, TargetWidth); + else + Args[PredIdx] = Region::createRdPredRegion( + V, i * TargetWidth, TargetWidth, "predsplit", CI, DL); + } + // address source + unsigned NumAddrs = 1; + if (IsTyped) + NumAddrs = (AtomicNumSrc >= 0) ? 4 : 3; + for (unsigned AddrI = 0; AddrI < NumAddrs; ++AddrI) { + Value *V = CI->getArgOperand(AddrIdx + AddrI); + Region R(V); + R.Width = R.NumElements = TargetWidth; + R.Offset = i * TargetWidth * V->getType()->getScalarSizeInBits()/8; // in bytes + Args[AddrIdx + AddrI] = R.createRdRegion(V, "addrsplit", CI, DL); + } + // data source + // We need to construct a new vector with 8 elements per enabled + // color. 
+ if (DataIdx != NONEED) { + Value *V = CI->getArgOperand(DataIdx); + auto DataTy = VectorType::get(V->getType()->getScalarType(), + TargetWidth * NumChannels * NumBlks); + auto ElmSz = V->getType()->getScalarSizeInBits() / 8; + Value *NewVec = UndefValue::get(DataTy); + if (!isa(V)) { + for (unsigned Channel = 0; Channel < NumChannels; ++Channel) { + Region RdR(V); + RdR.Width = RdR.NumElements = TargetWidth * NumBlks; + RdR.Offset = 4 * (Width * NumBlks * Channel + TargetWidth * NumBlks * i); + auto Rd = RdR.createRdRegion(V, "datasplit", CI, DL); + if (NumChannels > 1) { + Region WrR(DataTy); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Offset = ElmSz * TargetWidth * NumBlks * Channel; + NewVec = WrR.createWrRegion(NewVec, Rd, "datasplit", CI, DL); + } else + NewVec = Rd; + } + } + Args[DataIdx] = NewVec; + } + // atomic source operands + if (AtomicSrcIdx != NONEED) { + for (int SrcI = 0; SrcI < AtomicNumSrc; ++SrcI) { + Value *V = CI->getArgOperand(AtomicSrcIdx + SrcI); + Region R(V); + R.Width = R.NumElements = TargetWidth; + R.Offset = i * TargetWidth * V->getType()->getScalarSizeInBits()/8; // in bytes + Args[AtomicSrcIdx + SrcI] = R.createRdRegion(V, "addrsplit", CI, DL); + } + } + // now create the new narrower instruction + if (NewResult) { + Type *DstTy = nullptr; + if (DataIdx != NONEED) + DstTy = Args[DataIdx]->getType(); + else { + DstTy = VectorType::get(CI->getType()->getScalarType(), + TargetWidth * NumBlks * NumChannels); + } + SmallVector Tys = {DstTy}; + if (PredIdx != NONEED) + Tys.push_back(Args[PredIdx]->getType()); + if (AddrIdx != NONEED) + Tys.push_back(Args[AddrIdx]->getType()); + auto Decl = GenXIntrinsic::getAnyDeclaration( + CI->getParent()->getParent()->getParent(), IID, Tys); + auto *Gather = CallInst::Create(Decl, Args, CI->getName() + ".split", CI); + Gather->setDebugLoc(DL); + if (IsNewLoad) { + if (NumChannels == 1) + NewResult = generate1ChannelWrrregion(NewResult, InitialOffset, + Gather, LoadPred, i, CI); + else + NewResult = + generateNChannelWrregion(NewResult, InitialOffset, Gather, + LoadPred, i, NumSplits, NumChannels, CI); + continue; + } + // Join the results together, starting with the old value. + auto ElmSz = DstTy->getScalarSizeInBits() / 8; + if (NumChannels > 1) { + Region RdR(Gather); + RdR.Width = RdR.NumElements = TargetWidth * NumBlks; + Region WrR(NewResult); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Mask = Args[PredIdx]; + for (unsigned Channel = 0; Channel != NumChannels; ++Channel) { + RdR.Offset = ElmSz * TargetWidth * NumBlks * Channel; + auto Rd = RdR.createRdRegion(Gather, "joint", CI, DL); + WrR.Offset = 4 * (Width * NumBlks * Channel + TargetWidth * NumBlks * i); + NewResult = WrR.createWrRegion(NewResult, Rd, "join", CI, DL); + } + } else { + Region WrR(NewResult); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Offset = ElmSz * TargetWidth * NumBlks * i; + WrR.Mask = Args[PredIdx]; + NewResult = WrR.createWrRegion(NewResult, Gather, "join", CI, DL); + } + } else { + assert(CI->use_empty()); + assert(DataIdx != NONEED); + // Create the target-wide scatter instructions. 
+ Type *Tys[] = {Args[PredIdx]->getType(), Args[AddrIdx]->getType(), + Args[DataIdx]->getType()}; + auto Decl = GenXIntrinsic::getAnyDeclaration( + CI->getParent()->getParent()->getParent(), IID, Tys); + auto NewInst = CallInst::Create(Decl, Args, "", CI); + NewInst->setDebugLoc(DL); + } + } + + if (NewResult) + InstToReplace->replaceAllUsesWith(NewResult); + + if (InstToReplace != CI) + ToErase.push_back(InstToReplace); + ToErase.push_back(CI); + return true; +} + + +/*********************************************************************** + * processInst : process one instruction in GenXLowering + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::processInst(Instruction *Inst) { + if (isa(Inst)) + return lowerInsertElement(Inst); + if (isa(Inst)) + return lowerExtractElement(Inst); + if (isa(Inst)) + return lowerTrunc(Inst); + if (isa(Inst)) + return lowerCast(Inst); + if (auto SI = dyn_cast(Inst)) { + if (SI->getType()->getScalarType()->isIntegerTy(1)) { + if (SI->getType() == SI->getCondition()->getType()) + return lowerBoolVectorSelect(SI); + return lowerBoolScalarSelect(SI); + } + // Try lowering a non-bool select to wrregion. If lowerSelect decides + // not to, and it is a byte operation, widen it if necessary. + return lowerSelect(SI) || widenByteOp(SI); + } + if (auto SI = dyn_cast(Inst)) { + if (SI->getType()->getScalarType()->isIntegerTy(1)) + return lowerBoolShuffle(SI); + return lowerShuffle(SI); + } + if (isa(Inst)) { + if (widenByteOp(Inst)) + return true; + if (Inst->getOpcode() == Instruction::AShr || + Inst->getOpcode() == Instruction::LShr) + return lowerShr(Inst); + if (Inst->getOpcode() == Instruction::Mul) + return lowerMul64(Inst); + return false; + } + if (Inst->getOpcode() == Instruction::ICmp) + return widenByteOp(Inst); + else if (auto CI = dyn_cast(Inst)) + return lowerFCmpInst(CI); + if (CallInst *CI = dyn_cast(Inst)) { + if (CI->isInlineAsm()) + return false; + processTwoAddressOpnd(CI); + unsigned IntrinsicID = GenXIntrinsic::not_any_intrinsic; + if (Function *Callee = CI->getCalledFunction()) { + IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Callee); + assert(CI->getNumArgOperands() < GenXIntrinsicInfo::OPNDMASK); + } + // split gather/scatter/atomic into the width legal to the target + if (splitGatherScatter(CI, IntrinsicID)) + return true; + switch (IntrinsicID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + return lowerRdRegion(Inst); + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + return lowerWrRegion(Inst); + case GenXIntrinsic::genx_rdpredregion: + return lowerRdPredRegion(Inst); + case GenXIntrinsic::genx_wrpredregion: + return lowerWrPredRegion(Inst); + case GenXIntrinsic::not_any_intrinsic: + break; + case Intrinsic::dbg_value: + case GenXIntrinsic::genx_absf: + case GenXIntrinsic::genx_absi: + break; + default: + case GenXIntrinsic::genx_constantpred: + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + break; // ignore + case GenXIntrinsic::genx_vload: { + if (!Inst->use_empty()) { + Value *Ptr = Inst->getOperand(0); + LoadInst *LI = new LoadInst(Ptr, "", /*volatile*/ true, Inst); + LI->takeName(Inst); + LI->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(LI); + } + ToErase.push_back(Inst); + return true; + } + case GenXIntrinsic::genx_vstore: { + Value *Val = Inst->getOperand(0); + Value *Ptr = Inst->getOperand(1); + auto ST = new StoreInst(Val, Ptr, /*volatile*/ 
true, Inst); + ST->setDebugLoc(Inst->getDebugLoc()); + ToErase.push_back(Inst); + return true; + } + case Intrinsic::trap: + return lowerTrap(CI); + case Intrinsic::ctpop: + return lowerCtpop(CI); + case Intrinsic::uadd_with_overflow: + return lowerUAddWithOverflow(CI); + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + Inst->getContext().emitError( + Inst, "GenX backend cannot handle overflowing intrinsics yet"); + break; + } + return false; + } + if (ExtractValueInst *EV = dyn_cast(Inst)) + return lowerExtractValue(EV); + if (InsertValueInst *IV = dyn_cast(Inst)) + return lowerInsertValue(IV); + if (isa(Inst) || isa(Inst)) + return lowerLoadStore(Inst); + if (isa(Inst)) + Inst->getContext().emitError(Inst, + "GenX backend cannot handle allocas yet"); + return false; +} + +/*********************************************************************** + * lowerRdRegion : handle read region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * 1. If index is variable do add sinking on it. (This in itself does not + * cause this function to return true, because it does not cause the + * original instruction to be replaced.) + */ +bool GenXLowering::lowerRdRegion(Instruction *Inst) { + // Sink add in address calculation. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + *U = sinkAdd(*U); + return false; +} + +/*********************************************************************** + * lowerWrRegion : handle write region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * 1. If index is variable do add sinking on it. (This in itself does not + * cause this function to return true, because it does not cause the + * original instruction to be replaced.) + * + * 2. If it is a predicated byte wrregion, see if it can be widened. + */ +bool GenXLowering::lowerWrRegion(Instruction *Inst) { + // Sink add in address calculation. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + *U = sinkAdd(*U); + // See if a predicated byte wrregion can be widened. + return widenByteOp(Inst); +} + +/*********************************************************************** + * lowerRdPredRegion : handle read predicate region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * rdpredregion is a GenX backend internal intrinsic, and was thus created + * within this GenXLowering pass. However it is considered legal only if its + * uses are all in select or wrregion or wrpredpredregion; if not we lower + * it further here. If a use is in rdpredregion, we need to combine the two + * rdpredregions into one. 
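+ *
+ * Sketch of the fallback lowering (illustrative, assuming a read of Size
+ * elements starting at Start from a 32-wide predicate %in):
+ *   %wide = zext <32 x i1> %in to <32 x i16>
+ *   %rd   = rdregion of %wide reading Size elements from element Start
+ *   %res  = icmp ne %rd, zeroinitializer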
+ */ +bool GenXLowering::lowerRdPredRegion(Instruction *Inst) { + SmallVector RdPredRegionUsers; + bool Ok = true; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + continue; + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(User); + if (GenXIntrinsic::isWrRegion(IID)) + continue; + if (IID == GenXIntrinsic::genx_wrpredpredregion) + continue; + if (IID == GenXIntrinsic::genx_rdpredregion) { + RdPredRegionUsers.push_back(cast(User)); + continue; + } + if (IID == GenXIntrinsic::not_any_intrinsic) { + Ok = false; + break; + } + if (cast(User)->doesNotAccessMemory()) { + Ok = false; + break; + } + } + unsigned Start = cast(Inst->getOperand(1))->getZExtValue(); + unsigned Size = Inst->getType()->getVectorNumElements(); + if (Ok) { + // All uses in select/wrregion/rdpredregion/non-ALU intrinsic, so we can + // keep the rdpredregion. Check for uses in another rdpredregion; we need + // to combine those. + for (auto ui = RdPredRegionUsers.begin(), ue = RdPredRegionUsers.end(); + ui != ue; ++ui) { + auto User = *ui; + unsigned UserStart = + cast(User->getOperand(1))->getZExtValue(); + unsigned UserSize = User->getType()->getVectorNumElements(); + auto Combined = + Region::createRdPredRegion(Inst->getOperand(0), Start + UserStart, + UserSize, "", User, User->getDebugLoc()); + Combined->takeName(User); + User->replaceAllUsesWith(Combined); + ToErase.push_back(User); + } + return false; + } + // Need to lower it further. + const DebugLoc &DL = Inst->getDebugLoc(); + // Convert input to vector of short. + auto In = Inst->getOperand(0); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Type *InI16Ty = VectorType::get(I16Ty, In->getType()->getVectorNumElements()); + auto InI16 = CastInst::Create(Instruction::ZExt, In, InI16Ty, + Inst->getName() + ".lower1", Inst); + InI16->setDebugLoc(DL); + // Use rdregion to extract the region. + Region R(InI16); + R.getSubregion(Start, Size); + auto Rd = R.createRdRegion(InI16, Inst->getName() + ".lower3", Inst, DL); + // Convert back to predicate. + auto Res = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Rd, + Constant::getNullValue(Rd->getType()), + Inst->getName() + ".lower4", Inst); + Res->setDebugLoc(DL); + // Replace uses and erase. + Inst->replaceAllUsesWith(Res); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerWrPredRegion : handle write predicate region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * wrpredregion is a GenX backend internal intrinsic, and was thus created + * within this GenXLowering pass. However it is considered legal only if its + * "new value" input is a compare; if not we lower it further here. + */ +bool GenXLowering::lowerWrPredRegion(Instruction *Inst) { + auto NewVal = Inst->getOperand(1); + if (isa(NewVal)) + return false; + // Need to lower it further. + const DebugLoc &DL = Inst->getDebugLoc(); + // Convert "old value" input to vector of short. + auto OldVal = Inst->getOperand(0); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Type *OldValI16Ty = + VectorType::get(I16Ty, OldVal->getType()->getVectorNumElements()); + auto OldValI16 = CastInst::Create(Instruction::ZExt, OldVal, OldValI16Ty, + Inst->getName() + ".lower1", Inst); + OldValI16->setDebugLoc(DL); + // Convert "new value" input to vector of short. 
+ Type *NewValI16Ty = + VectorType::get(I16Ty, NewVal->getType()->getVectorNumElements()); + auto NewValI16 = CastInst::Create(Instruction::ZExt, NewVal, NewValI16Ty, + Inst->getName() + ".lower2", Inst); + NewValI16->setDebugLoc(DL); + // Use wrregion to write the new value into the old value. + Region R(OldValI16); + R.getSubregion(cast(Inst->getOperand(2))->getZExtValue(), + NewValI16Ty->getVectorNumElements()); + auto Wr = R.createWrRegion(OldValI16, NewValI16, Inst->getName() + ".lower3", + Inst, DL); + // Convert back to predicate. + auto Res = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Wr, + Constant::getNullValue(Wr->getType()), + Inst->getName() + ".lower4", Inst); + Res->setDebugLoc(DL); + // Replace uses and erase. + Inst->replaceAllUsesWith(Res); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerInsertElement : lower InsertElement to wrregion, multiplying the + * index by the element size + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerInsertElement(Instruction *Inst) { + Instruction *NewInst = NULL; + // Special case - if the result has 1 element (usually turning scalar into 1 + // element vector) then simply transform the insert element into a bitcast We + // don't need to worry about the index since if it is not zero the result is + // undef anyway (and can be set to anything we like) We also don't need to + // worry about what the original vector is (usually undef) since it will be + // overwritten or undef + VectorType *VT = dyn_cast(Inst->getType()); + assert(VT); + unsigned NumElements = VT->getNumElements(); + const DebugLoc &DL = Inst->getDebugLoc(); + if (NumElements == 1) { + Value *ToInsert = Inst->getOperand(1); + NewInst = CastInst::Create(Instruction::BitCast, ToInsert, VT, + Inst->getName(), Inst); + NewInst->setDebugLoc(DL); + } else if (!Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Cast and scale the index. + Value *IdxVal = scaleInsertExtractElementIndex( + Inst->getOperand(2), Inst->getOperand(1)->getType(), Inst); + // Sink adds in the address calculation. + IdxVal = sinkAdd(IdxVal); + // Create the new wrregion + Value *Src = Inst->getOperand(1); + Region R(Src); + R.Indirect = IdxVal; + NewInst = cast(R.createWrRegion( + Inst->getOperand(0), Src, Inst->getName(), Inst /*InsertBefore*/, DL)); + } else { + // Boolean insertelement. We have to cast everything to i16, do the + // insertelement, and cast it back again. All this gets further lowered + // subsequently. + auto I16Ty = Type::getIntNTy(Inst->getContext(), 16); + auto VecTy = + VectorType::get(I16Ty, Inst->getType()->getVectorNumElements()); + auto CastVec = + CastInst::Create(Instruction::ZExt, Inst->getOperand(0), VecTy, + Inst->getOperand(0)->getName() + ".casti16", Inst); + CastVec->setDebugLoc(DL); + auto CastEl = + CastInst::Create(Instruction::ZExt, Inst->getOperand(1), I16Ty, + Inst->getOperand(1)->getName() + ".casti16", Inst); + CastEl->setDebugLoc(DL); + auto NewInsert = InsertElementInst::Create(CastVec, CastEl, + Inst->getOperand(2), "", Inst); + NewInsert->takeName(Inst); + NewInsert->setDebugLoc(DL); + NewInst = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewInsert, + Constant::getNullValue(VecTy), + NewInsert->getName() + ".casti1", Inst); + NewInst->setDebugLoc(DL); + } + // Change uses and mark the old inst for erasing. 
+ Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerExtractElement : lower ExtractElement to rdregion, multiplying the + * index by the element size + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerExtractElement(Instruction *Inst) { + Instruction *NewInst = nullptr; + if (!Inst->getType()->isIntegerTy(1)) { + // Cast and scale the index. + Type *ElTy = Inst->getType(); + Value *IdxVal = + scaleInsertExtractElementIndex(Inst->getOperand(1), ElTy, Inst); + // Sink adds in the address calculation. + IdxVal = sinkAdd(IdxVal); + // Create the new rdregion. + Region R(Inst); + R.Indirect = IdxVal; + NewInst = R.createRdRegion(Inst->getOperand(0), Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc(), + true /*AllowScalar*/); + } else { + // Boolean extractelement. We have to cast everything to i16, do the + // extractelement, and cast it back again. All this gets further lowered + // subsequently. + auto I16Ty = Type::getIntNTy(Inst->getContext(), 16); + auto VecTy = VectorType::get( + I16Ty, Inst->getOperand(0)->getType()->getVectorNumElements()); + auto CastVec = + CastInst::Create(Instruction::ZExt, Inst->getOperand(0), VecTy, + Inst->getOperand(0)->getName() + ".casti16", Inst); + const DebugLoc &DL = Inst->getDebugLoc(); + CastVec->setDebugLoc(DL); + auto NewExtract = + ExtractElementInst::Create(CastVec, Inst->getOperand(1), "", Inst); + NewExtract->takeName(Inst); + NewExtract->setDebugLoc(DL); + NewInst = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewExtract, + Constant::getNullValue(I16Ty), + NewExtract->getName() + ".casti1", Inst); + NewInst->setDebugLoc(DL); + } + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * scaleInsertExtractElementIndex : scale index by element byte size, + * and ensure it is an i16 + */ +Value *GenXLowering::scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy, + Instruction *InsertBefore) { + // Do the cast and multiply. + unsigned ElementBytes = ElTy->getPrimitiveSizeInBits() / 8; + IntegerType *I16Ty = Type::getInt16Ty(IdxVal->getContext()); + if (ConstantInt *CI = dyn_cast(IdxVal)) + return ConstantInt::get(I16Ty, CI->getSExtValue() * ElementBytes); + // Ensure the variable offset is i16. + Instruction *IdxInst = CastInst::CreateIntegerCast( + IdxVal, I16Ty, false /*isSigned*/, "cast", InsertBefore); + IdxInst->setDebugLoc(InsertBefore->getDebugLoc()); + // Multiply it by the element size in bytes. + if (ElementBytes != 1) { + IdxInst = BinaryOperator::Create( + Instruction::Shl, IdxInst, + ConstantInt::get(I16Ty, genx::log2(ElementBytes)), "scale", + InsertBefore); + IdxInst->setDebugLoc(InsertBefore->getDebugLoc()); + } + return IdxInst; +} + +/*********************************************************************** + * lowerTrunc : lower a TruncInst + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * A Trunc is lowered to a bitcast then a region/element read with a stride. + * GenXCoalescing will coalesce the bitcast, so this will hopefully save + * an instruction. 
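+ * (Illustratively, "trunc <8 x i32> %v to <8 x i16>" becomes a bitcast of %v
+ * to <16 x i16> followed by an rdregion of 8 elements with a stride of 2.)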
+ */ +bool GenXLowering::lowerTrunc(Instruction *Inst) { + Value *InValue = Inst->getOperand(0); + // Check for the trunc's input being a sext/zext where the original element + // size is the same as the result of the trunc. We can just remove the + // whole thing then. (This can arise from GenXReduceIntSize.) + if (auto CI = dyn_cast(InValue)) { + if ((isa(CI) || isa(CI)) && + CI->getOperand(0)->getType() == Inst->getType()) { + // Just replace uses with the original unextended value. + Inst->replaceAllUsesWith(CI->getOperand(0)); + ToErase.push_back(Inst); + return true; + } + } + + // Lower "trunc i8 %v to i1" into "cmp.ne (%v & 1), 0" + if (Inst->getType()->isIntOrIntVectorTy(1)) { + IRBuilder<> Builder(Inst); + auto V = + Builder.CreateAnd(InValue, ConstantInt::get(InValue->getType(), 1)); + V = Builder.CreateICmpNE(V, ConstantInt::get(V->getType(), 0)); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; + } + + Type *InElementTy = InValue->getType(); + Type *OutElementTy = Inst->getType(); + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(InElementTy)) { + InElementTy = VT->getElementType(); + OutElementTy = cast(OutElementTy)->getElementType(); + NumElements = VT->getNumElements(); + } + + // Lower "trunc <32 x i16> %v to <32 x i1>" into "cmp.ne (%v & 1), 0" + if (NumElements > 1 && OutElementTy->isIntegerTy(1)) { + IRBuilder<> Builder(Inst); + unsigned N = NumElements; + Value *Os = ConstantVector::getSplat(N, ConstantInt::get(InElementTy, 1)); + Value *Zs = ConstantVector::getSplat(N, ConstantInt::get(InElementTy, 0)); + auto V = Builder.CreateAnd(InValue, Os); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + V = Builder.CreateICmpNE(V, Zs); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; + } + + assert(OutElementTy->getPrimitiveSizeInBits()); + unsigned Stride = InElementTy->getPrimitiveSizeInBits() / + OutElementTy->getPrimitiveSizeInBits(); + // Create the new bitcast. + Instruction *BC = + CastInst::Create(Instruction::BitCast, InValue, + VectorType::get(OutElementTy, Stride * NumElements), + Inst->getName(), Inst /*InsertBefore*/); + BC->setDebugLoc(Inst->getDebugLoc()); + // Create the new rdregion. + Region R(BC); + R.NumElements = NumElements; + R.Stride = Stride; + R.Width = NumElements; + R.VStride = R.Stride * R.Width; + Instruction *NewInst = R.createRdRegion( + BC, Inst->getName(), Inst /*InsertBefore*/, Inst->getDebugLoc(), + !isa(Inst->getType()) /*AllowScalar*/); + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerCast : lower a CastInst + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerCast(Instruction *Inst) { + // If it is zext/sext/UIToFP from (vector of) i1, turn into a select. 
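+  // (Illustratively, "zext <8 x i1> %p to <8 x i32>" becomes a select of
+  // splat 1 against splat 0 under %p; sext selects -1 instead of 1.)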
+ if (Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1) && + Inst->getOpcode() != Instruction::BitCast) { + int OneVal = 0; + switch (Inst->getOpcode()) { + case Instruction::ZExt: + OneVal = 1; + break; + case Instruction::SExt: + OneVal = -1; + break; + case Instruction::UIToFP: + OneVal = 1; + break; + default: + assert(0 && "unknown opcode in lowerCast"); + } + + Instruction *NewInst; + if (Inst->getType()->isFPOrFPVectorTy()) + NewInst = SelectInst::Create( + Inst->getOperand(0), ConstantFP::get(Inst->getType(), OneVal), + ConstantFP::get(Inst->getType(), 0), Inst->getName(), Inst); + else + NewInst = SelectInst::Create( + Inst->getOperand(0), ConstantInt::get(Inst->getType(), OneVal), + ConstantInt::get(Inst->getType(), 0), Inst->getName(), Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; + } + return false; +} + +/*********************************************************************** + * lowerSelect : lower a non-i1 select + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * Lower select into predicated wrr. This transform is profitable + * if we can bale into resulting wrr later + */ +bool GenXLowering::lowerSelect(SelectInst *SI) { + assert(SI); + + if (!isa(SI->getOperand(0)->getType())) + return false; // scalar selector + + // Do not lower byte select, because byte wrr then will be widened + if (SI->getTrueValue()->getType()->getScalarType()->isIntegerTy(8)) + return false; + + Value *Cond = SI->getCondition(); + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + + // Do not transform if one of the sources is constant. + // Now post-legalization generarates redundant moves for constants. + // It's also required for correct baling of function pointers' PtrToInts + // into select. + // This check can be relaxed. + if (isa(TrueVal) || isa(FalseVal)) + return false; + + // If select is used by unmasked wrr than we do not apply transformation too + // because wrr+wrr is not optimal. In this case select itself will bale into + // wrr. There might be some cases where wrr user of + // select can be eliminated too. + if (SI->hasOneUse() && GenXIntrinsic::isWrRegion(SI->user_back())) { + auto *I = cast(SI->user_back()); + if ((I->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum) == SI) && + !Region(I, BaleInfo()).Mask) + return false; + } + + // GenXPatternMatch tries to convert cmp + select + // into min/max instructions. So do not transform in this case + // This check can be relaxed too. 
+ if (isa(Cond)) + return false; + + bool TrueValUsedOnce = TrueVal->hasOneUse(); + bool FalseValUsedOnce = FalseVal->hasOneUse(); + + // Baling produces better code if balable + // value has single use + if (!FalseValUsedOnce && !TrueValUsedOnce) + return false; + + // So select this value + bool InvertPred = false; + Value *OldWrrVal = FalseVal; + Value *NewWrrVal = TrueVal; + if (GotoJoin::isEMValue(Cond) && !TrueValUsedOnce) { + // Conversion only for true val if EM since + // EM is implicit, inverting it will require extra instructions + return false; + } + + if (FalseValUsedOnce && !TrueValUsedOnce) { + std::swap(OldWrrVal, NewWrrVal); + InvertPred = true; + } + + // Main check: profitable only if we can bale later + Region R(SI); + R.Mask = Cond; + if (!GenXBaling::isBalableNewValueIntoWrr(NewWrrVal, R, ST)) + return false; + + // Inverting predicate if false value of select was choosen + // as new value for wrr + if (InvertPred) { + R.Mask = BinaryOperator::Create( + Instruction::Xor, R.Mask, Constant::getAllOnesValue(R.Mask->getType()), + SI->getName() + ".invertpred", SI); + cast(R.Mask)->setDebugLoc(SI->getDebugLoc()); + } + + auto NewWrRegion = cast(R.createWrRegion( + OldWrrVal, NewWrrVal, SI->getName() + ".lower", SI, SI->getDebugLoc())); + SI->replaceAllUsesWith(NewWrRegion); + ToErase.push_back(SI); + return true; +} + +/*********************************************************************** + * lowerBoolScalarSelect : lower a SelectInst on vector of i1 + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * This is a select on vector of i1 where the condition is scalar. This only + * happens in simd control flow where an LLVM pass has optimized away the + * conditional branch. We restore the conditional branch and create an + * if..else..endif. + */ +bool GenXLowering::lowerBoolScalarSelect(SelectInst *SI) { + // BB1 + // / | + // false / | true + // / | + // BB2 | + // \ | + // \ | + // \ | + // BB4 + // + auto BB1 = SI->getParent(); + auto BB2 = SplitBlock(BB1, SI, DT); + auto BB4 = SplitEdge(BB1, BB2, DT); + BB2->setName("select.false"); + BB4->setName("select.true"); + + auto OldTerm = BB1->getTerminator(); + BranchInst::Create(BB4, BB2, SI->getCondition(), OldTerm); + OldTerm->eraseFromParent(); + // Since additional edge is added between BB1 and BB4 instead of through BB2 + // only. BB4 is not immediately dominated by BB2 anymore. Instead, BB4 is + // dominated by BB1 immediately. + if (DT) + DT->changeImmediateDominator(BB4, BB1); + // Replace 'select' with 'phi' + auto Phi = PHINode::Create(SI->getType(), /*NumReservedValues=*/2, "", + &BB4->front()); + Phi->takeName(SI); + Phi->addIncoming(SI->getTrueValue(), BB1); + Phi->addIncoming(SI->getFalseValue(), BB2); + SI->replaceAllUsesWith(Phi); + ToErase.push_back(SI); + // Split the (critical) edge from BB1 to BB4 to avoid having critical edge. + auto BB3 = SplitEdge(BB1, BB4, DT); + BB3->setName("select.crit"); + return true; +} + +/*********************************************************************** + * lowerBoolVectorSelect : lower a SelectInst on (vector of) i1 + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * A select on (vector of) i1 is lowered to the equivalent and/or/xor + * instructions. No simplification is done even if an input is a constant. 
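+ * (That is, illustratively, "select %c, %a, %b" becomes
+ * "(%a & %c) | (%b & ~%c)".)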
+ * + * However, if the selector looks like an EM value, and the "true" operand is + * a cmp, it is instead lowered to an llvm.genx.wrpredpredregion. Baling will + * bale the cmp into it, resulting in a masked cmp instruction that sets bits + * of the flag only if the corresponding EM bit is set. + * + * FIXME: I have seen a case where the two inputs are all false and all true. + * Rather than try and simplify that here in the GenX backend, we should + * try and work out how to stop LLVM generating it in the first place. + */ +bool GenXLowering::lowerBoolVectorSelect(SelectInst *Inst) { + if (isa(Inst->getTrueValue())) { + // Check for the condition being an EM value. It might be a shufflevector + // that slices the EM value at index 0. + bool IsEM = GotoJoin::isEMValue(Inst->getCondition()); + if (!IsEM) { + if (auto SV = dyn_cast(Inst->getCondition())) { + ShuffleVectorAnalyzer SVA(SV); + if (!SVA.getAsSlice()) { + // Slice at index 0. + IsEM = GotoJoin::isEMValue(SV->getOperand(0)); + } + } + } + if (IsEM) { + // Can be lowered to llvm.genx.wrpredpredregion. It always has an index of + // 0 and the "new value" operand the same vector width as the whole vector + // here. That might get changed if it is split up in legalization. + auto NewInst = Region::createWrPredPredRegion( + Inst->getFalseValue(), Inst->getTrueValue(), 0, Inst->getCondition(), + "", Inst, Inst->getDebugLoc()); + NewInst->takeName(Inst); + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; + } + } + // Normal lowering to some bit twiddling. + Instruction *NewInst1 = + BinaryOperator::Create(BinaryOperator::And, Inst->getOperand(0), + Inst->getOperand(1), Inst->getName(), Inst); + NewInst1->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst2 = BinaryOperator::Create( + BinaryOperator::Xor, Inst->getOperand(0), + Constant::getAllOnesValue(Inst->getType()), Inst->getName(), Inst); + NewInst2->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst3 = + BinaryOperator::Create(BinaryOperator::And, Inst->getOperand(2), NewInst2, + Inst->getName(), Inst); + NewInst3->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst4 = BinaryOperator::Create( + BinaryOperator::Or, NewInst1, NewInst3, Inst->getName(), Inst); + NewInst4->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst4); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerBoolShuffle : lower a shufflevector (element type i1) + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * We handle three cases: + * + * 1. A slice of the vector, which can be turned into rdpredregion. + * + * 2. A splat. By default we need to lower that to a select to + * 0 or -1 then a bitcast to the vector of i1. But if the input is the + * result of a cmp then we can splat the cmp as an optimization. + * + * 3. An unslice of the vector, which can be turned into wrpredregion. + */ +bool GenXLowering::lowerBoolShuffle(ShuffleVectorInst *SI) { + ShuffleVectorAnalyzer SVA(SI); + // 1. Check for a slice. + int SliceStart = SVA.getAsSlice(); + if (SliceStart >= 0) { + unsigned Width = SI->getType()->getVectorNumElements(); + auto RPR = Region::createRdPredRegion(SI->getOperand(0), SliceStart, Width, + "", SI, SI->getDebugLoc()); + RPR->takeName(SI); + SI->replaceAllUsesWith(RPR); + ToErase.push_back(SI); + return true; + } + // 2. Check for a splat. 
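+  // (Illustratively, a shufflevector of <8 x i1> whose mask is all zeros
+  // broadcasts element 0 of its first operand.)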
+ auto Splat = SVA.getAsSplat(); + if (Splat.Input) + return lowerBoolSplat(SI, Splat.Input, Splat.Index); + // 3. Check for an unslice. The "old value" input is operand 0 of the + // shufflevector; the "new value" input is operand 0 of the shufflevector + // that is operand 1 of SI. We create a wrpredregion, but GenXLowering might + // subsequently decide that it is illegal because its "new value" input is not + // a compare, in which case it is further lowered. + int UnsliceStart = SVA.getAsUnslice(); + if (UnsliceStart >= 0) { + auto InnerSI = cast(SI->getOperand(1)); + auto WPR = + Region::createWrPredRegion(SI->getOperand(0), InnerSI->getOperand(0), + UnsliceStart, "", SI, SI->getDebugLoc()); + WPR->takeName(SI); + SI->replaceAllUsesWith(WPR); + // Undef out the operand for InnerSI in SI, so we can directly erase InnerSI + // if SI was its only use. + SI->setOperand(1, UndefValue::get(InnerSI->getType())); + ToErase.push_back(SI); + if (InnerSI->use_empty()) + InnerSI->eraseFromParent(); + return true; + } + + // Do not lower replicated slices. + if (SVA.isReplicatedSlice()) + return false; + + // No other cases handled. + SI->getContext().emitError( + SI, "general bool shuffle vector instruction not implemented"); + return false; +} + +/*********************************************************************** + * lowerBoolSplat : lower a shufflevector (element type i1) that is a splat + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerBoolSplat(ShuffleVectorInst *SI, Value *In, + unsigned Idx) { + unsigned Width = SI->getType()->getVectorNumElements(); + if (isa(In->getType())) { + IRBuilder<> B(SI); + Constant *C1 = ConstantVector::getSplat(Width, B.getInt16(1)); + Constant *C0 = ConstantVector::getSplat(Width, B.getInt16(0)); + Value *V = B.CreateSelect(In, C1, C0); + Region R(V); + R.NumElements = Width; + R.Stride = 0; + R.VStride = 0; + R.Offset = (int)Idx; + V = R.createRdRegion(V, "splat", SI, SI->getDebugLoc()); + V = B.CreateICmpNE(V, C0); + SI->replaceAllUsesWith(V); + ToErase.push_back(SI); + return true; + } + // This is a splat. See if the input is a cmp, possibly via a bitcast. + if (auto BC = dyn_cast(In)) + In = BC->getOperand(0); + if (auto Cmp = dyn_cast(In)) { + // Create a splatted version of the cmp. + Value *CmpOpnds[2]; + Region R(Cmp->getOperand(0)); + R.NumElements = Width; + R.Width = R.NumElements; + R.Stride = 0; + R.VStride = 0; + for (unsigned i = 0; i != 2; ++i) { + auto Opnd = Cmp->getOperand(i); + if (auto C = dyn_cast(Opnd)) { + CmpOpnds[i] = ConstantVector::getSplat(R.NumElements, C); + continue; + } + if (!isa(Opnd->getType())) { + auto NewBC = CastInst::Create(Instruction::BitCast, Opnd, + VectorType::get(Opnd->getType(), 1), + Opnd->getName() + ".bc", Cmp); + NewBC->setDebugLoc(Cmp->getDebugLoc()); + Opnd = NewBC; + } + CmpOpnds[i] = + R.createRdRegion(Opnd, Cmp->getOperand(i)->getName() + ".splat", + Cmp /*InsertBefore*/, Cmp->getDebugLoc()); + } + auto NewCmp = CmpInst::Create( + Cmp->getOpcode(), Cmp->getPredicate(), CmpOpnds[0], CmpOpnds[1], + Cmp->getName() + ".splat", Cmp /*InsertBefore*/); + NewCmp->setDebugLoc(Cmp->getDebugLoc()); + SI->replaceAllUsesWith(NewCmp); + ToErase.push_back(SI); + return true; + } + // Default code. Select int and bitcast to vector of i1. + if (isa(In->getType())) { + // First convert v1i1 to i1. 
+ auto NewBC = CastInst::Create(Instruction::BitCast, In, + In->getType()->getScalarType(), + In->getName() + ".scalar", SI); + NewBC->setDebugLoc(SI->getDebugLoc()); + In = NewBC; + } + if (Width == 8 || Width == 16 || Width == 32) { + auto IntTy = Type::getIntNTy(SI->getContext(), Width); + auto Sel = SelectInst::Create(In, Constant::getAllOnesValue(IntTy), + Constant::getNullValue(IntTy), + SI->getName() + ".sel", SI); + Sel->setDebugLoc(SI->getDebugLoc()); + auto NewBC = + CastInst::Create(Instruction::BitCast, Sel, SI->getType(), "", SI); + NewBC->takeName(SI); + NewBC->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewBC); + ToErase.push_back(SI); + return true; + } + + IRBuilder<> Builder(SI); + auto Val = Builder.CreateSelect(In, Builder.getInt16(1), Builder.getInt16(0), + SI->getName() + ".sel"); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + Val = Builder.CreateBitCast(Val, VectorType::get(Builder.getInt16Ty(), 1)); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + + Region R(Val); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = Width; + Val = R.createRdRegion(Val, "", SI, SI->getDebugLoc()); + Val = Builder.CreateICmpNE(Val, ConstantVector::getNullValue(Val->getType())); + Val->takeName(SI); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(Val); + ToErase.push_back(SI); + return true; +} + +/*********************************************************************** + * lowerShuffleSplat : lower a ShuffleInst (element type not i1) when it is + * a splat (repetition of the same element) + */ +void GenXLowering::lowerShuffleSplat(ShuffleVectorInst *SI, + ShuffleVectorAnalyzer::SplatInfo Splat) { + // This is a splat. Turn it into a splatting rdregion. + if (!isa(Splat.Input->getType())) { + // The input is a scalar rather than a 1-vector. Bitcast it to a 1-vector. + auto *BC = CastInst::Create(Instruction::BitCast, Splat.Input, + VectorType::get(Splat.Input->getType(), 1), + SI->getName(), SI); + BC->setDebugLoc(SI->getDebugLoc()); + Splat.Input = BC; + } + // Create a rdregion with a stride of 0 to represent this splat + Region R(Splat.Input); + R.NumElements = SI->getType()->getVectorNumElements(); + R.Width = R.NumElements; + R.Stride = 0; + R.VStride = 0; + R.Offset = Splat.Index * R.ElementBytes; + Instruction *NewInst = + R.createRdRegion(Splat.Input, "", SI /*InsertBefore*/, SI->getDebugLoc()); + NewInst->takeName(SI); + NewInst->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewInst); + ToErase.push_back(SI); +} + +/*********************************************************************** + * lowerShuffle : lower a ShuffleInst (element type not i1) + * + * Mostly these are splats. These are lowered to a rdregion + * Any other shuffle is currently unsupported + */ +bool GenXLowering::lowerShuffle(ShuffleVectorInst *SI) { + auto Splat = ShuffleVectorAnalyzer(SI).getAsSplat(); + if (Splat.Input) { + lowerShuffleSplat(SI, Splat); + return true; + } + if (lowerShuffleToSelect(SI)) + return true; + lowerShuffleToMove(SI); + return true; +} + +// Lower those shufflevector that can be implemented efficiently as select. 
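+// (Illustratively, with 4-element operands a mask of <0, 5, 2, 7>, where each
+// result lane i comes from lane i of one of the two operands, becomes a select
+// with the constant predicate <1, 0, 1, 0>.)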
+bool GenXLowering::lowerShuffleToSelect(ShuffleVectorInst *SI) { + int NumElements = SI->getType()->getVectorNumElements(); + int NumOpnd = SI->getNumOperands(); + for (int i = 0; i < NumOpnd; ++i) { + if (SI->getOperand(i)->getType()->getVectorNumElements() != NumElements) + return false; + } + for (int i = 0; i < NumElements; ++i) { + int idx = SI->getMaskValue(i); + // undef index returns -1. + if (idx < 0) + continue; + if (idx != i && idx != i + NumElements) + return false; + } + IRBuilder<> Builder(SI); + Type *Int1Ty = Builder.getInt1Ty(); + SmallVector MaskVec; + MaskVec.reserve(NumElements); + for (int i = 0; i < NumElements; ++i) { + int idx = SI->getMaskValue(i); + // undef index returns -1. + if (idx == i || idx < 0) + MaskVec.push_back(ConstantInt::get(Int1Ty, 1)); + else + MaskVec.push_back(ConstantInt::get(Int1Ty, 0)); + } + Value *Mask = ConstantVector::get(MaskVec); + auto NewSel = + SelectInst::Create(Mask, SI->getOperand(0), SI->getOperand(1), "", SI); + NewSel->takeName(SI); + NewSel->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewSel); + ToErase.push_back(SI); + return true; +} + +template Iter skipUndefs(Iter First, Iter Last) { + return std::find_if(First, Last, [](int MaskVal) { return MaskVal != -1; }); +} + +/*********************************************************************** + * lowerShuffleToMove : lower a ShuffleInst (element type not i1) to a + * sequence of rd/wrregion intrinsics + */ +void GenXLowering::lowerShuffleToMove(ShuffleVectorInst *SI) { + ShuffleVectorAnalyzer Analyzer(SI); + std::vector RdRegions; + std::vector WrRegions; + auto MaskVals = SI->getShuffleMask(); + + // Filling read and write regions based on shuffle mask. + for (auto It = skipUndefs(MaskVals.begin(), MaskVals.end()); + It != MaskVals.end(); + It = skipUndefs(std::next(It, RdRegions.back().R.NumElements), + MaskVals.end())) { + int Idx = It - MaskVals.begin(); + auto OpRegion = Analyzer.getMaskRegionPrefix(Idx); + assert(OpRegion.R.NumElements > 0 && + "should've match at least 1 element region"); + Region WrRegion(SI); + WrRegion.Offset = Idx * WrRegion.ElementBytes; + WrRegion.NumElements = WrRegion.Width = OpRegion.R.NumElements; + RdRegions.push_back(std::move(OpRegion)); + WrRegions.push_back(std::move(WrRegion)); + } + + // Building rdregion intrinsics or promoting the operand if possible. + std::vector RdRegionInsts; + std::transform( + RdRegions.begin(), RdRegions.end(), std::back_inserter(RdRegionInsts), + [SI](ShuffleVectorAnalyzer::OperandRegionInfo &OpRegion) -> Value * { + if (OpRegion.Op->getType()->getVectorNumElements() == + OpRegion.R.NumElements) + return OpRegion.Op; + return OpRegion.R.createRdRegion( + OpRegion.Op, SI->getName() + ".shuffle.rd", SI, SI->getDebugLoc()); + }); + + // Obtaining SI replacement (sequence of wrregion intrinsics in the + // most common case). 
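+  // (Each rdregion result is written into the accumulated value at the offset
+  // where its contiguous run of the shuffle mask starts.)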
+ Value *Result; + if (WrRegions.size() == 0) + Result = UndefValue::get(SI->getType()); + else if (WrRegions.size() == 1 && WrRegions.front().NumElements == + SI->getType()->getVectorNumElements()) + Result = RdRegionInsts.back(); + else { + auto WrRegionArgs = zip(WrRegions, RdRegionInsts); + Result = std::accumulate( + WrRegionArgs.begin(), WrRegionArgs.end(), + static_cast(UndefValue::get(SI->getType())), + [SI](Value *PrevWrRegionInst, + const std::tuple &Args) { + return std::get<0>(Args).createWrRegion( + PrevWrRegionInst, std::get<1>(Args), + SI->getName() + ".shuffle.wr", SI, SI->getDebugLoc()); + }); + } + + SI->replaceAllUsesWith(Result); + ToErase.push_back(SI); +} + +/*********************************************************************** + * lowerShr : lower Shl followed by AShr/LShr by the same amount + * into trunc+sext/zext + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * See convertShlShr below. + */ +bool GenXLowering::lowerShr(Instruction *Inst) { + Instruction *NewInst = convertShlShr(Inst); + if (!NewInst) + return false; // no conversion done + ToErase.push_back(Inst); + auto Shl = cast(Inst->getOperand(0)); + if (Shl->hasOneUse()) + ToErase.push_back(Shl); + return true; +} + +/*********************************************************************** + * convertShlShr : convert Shl followed by AShr/LShr by the same amount + * into trunc+sext/zext + * + * Enter: Inst = the AShr or LShr instruction + * + * Return: 0 if no conversion done, else the new SExt/ZExt instruction. + * The original AShr/LShr is now unused, but neither original + * instruction is erased. + * + * This is the opposite to what instruction combining does! We want to change + * it back to trunc then extend because the trunc can then be lowered into + * a region, and the extend can sometimes be baled into whatever uses it. + * + * This is a separately callable global function so it can also be used + * from GenXReduceIntSize, which for other reasons of convenience runs before + * GenXLowering. + */ +Instruction *llvm::genx::convertShlShr(Instruction *Inst) { + unsigned NumBits = Inst->getType()->getScalarType()->getPrimitiveSizeInBits(); + auto C = dyn_cast(Inst->getOperand(1)); + if (!C) + return nullptr; + auto Shl = dyn_cast(Inst->getOperand(0)); + if (!Shl) + return nullptr; + if (Shl->getOpcode() != Instruction::Shl) + return nullptr; + if (Shl->getOperand(1) != C) + return nullptr; + if (isa(C->getType())) { + C = C->getSplatValue(); + if (!C) + return nullptr; + } + unsigned ShiftBits = cast(C)->getSExtValue(); + unsigned RemainingBits = NumBits - ShiftBits; + if (RemainingBits != 8 && RemainingBits != 16) + return nullptr; + // We have Shl+AShr or Shl+LShr that can be turned into trunc+sext/zext. + Type *ConvTy = Type::getIntNTy(Inst->getContext(), RemainingBits); + if (auto VT = dyn_cast(Inst->getType())) + ConvTy = VectorType::get(ConvTy, VT->getNumElements()); + auto Trunc = CastInst::Create(Instruction::Trunc, Shl->getOperand(0), ConvTy, + "", Inst); + Trunc->takeName(Shl); + Trunc->setDebugLoc(Shl->getDebugLoc()); + auto Ext = CastInst::Create(Inst->getOpcode() == Instruction::AShr + ? 
Instruction::SExt + : Instruction::ZExt, + Trunc, Inst->getType(), "", Inst); + Ext->takeName(Inst); + Ext->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(Ext); + return Ext; +} + +/*********************************************************************** + * splitStructPhis : find struct phi nodes and split them + * + * Return: whether code modified + * + * Each struct phi node is split into a separate phi node for each struct + * element. This is needed because the GenX backend's liveness and coalescing + * code cannot cope with a struct phi. + * + * This is run in two places: firstly in GenXLowering, so that pass can then + * simplify any InsertElement and ExtractElement instructions added by the + * struct phi splitting. But then it needs to be run again in GenXLiveness, + * because other passes can re-insert a struct phi. The case I saw in + * hevc_speed was something commoning up the struct return from two calls in an + * if..else..endif. + */ +bool genx::splitStructPhis(Function *F) { + bool Modified = false; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin();;) { + PHINode *Phi = dyn_cast(&*bi); + if (!Phi) + break; + ++bi; // increment here as splitStructPhi removes old phi node + if (isa(Phi->getType())) + Modified |= GenXLowering::splitStructPhi(Phi); + } + } + return Modified; +} + +/*********************************************************************** + * splitStructPhi : split a phi node with struct type by splitting into + * struct elements + */ +bool GenXLowering::splitStructPhi(PHINode *Phi) { + StructType *Ty = cast(Phi->getType()); + // Find where we need to insert the combine instructions. + Instruction *CombineInsertBefore = Phi->getParent()->getFirstNonPHI(); + // Now split the phi. + Value *Combined = UndefValue::get(Ty); + // For each struct element... + for (unsigned Idx = 0, e = Ty->getNumElements(); Idx != e; ++Idx) { + Type *ElTy = Ty->getTypeAtIndex(Idx); + // Create the new phi node. + PHINode *NewPhi = + PHINode::Create(ElTy, Phi->getNumIncomingValues(), + Phi->getName() + ".element" + Twine(Idx), Phi); + NewPhi->setDebugLoc(Phi->getDebugLoc()); + // Combine the new phi. + Instruction *Combine = InsertValueInst::Create( + Combined, NewPhi, Idx, NewPhi->getName(), CombineInsertBefore); + Combine->setDebugLoc(Phi->getDebugLoc()); + Combined = Combine; + // For each incoming... + for (unsigned In = 0, InEnd = Phi->getNumIncomingValues(); In != InEnd; + ++In) { + // Create an extractelement to get the individual element value. + // This needs to go before the terminator of the incoming block. + BasicBlock *IncomingBB = Phi->getIncomingBlock(In); + Value *Incoming = Phi->getIncomingValue(In); + Instruction *Extract = ExtractValueInst::Create( + Incoming, Idx, Phi->getName() + ".element" + Twine(Idx), + IncomingBB->getTerminator()); + Extract->setDebugLoc(Phi->getDebugLoc()); + // Add as an incoming of the new phi node. + NewPhi->addIncoming(Extract, IncomingBB); + } + } + Phi->replaceAllUsesWith(Combined); + Phi->eraseFromParent(); + return true; +} + +/*********************************************************************** + * lowerExtractValue : remove extractvalue if possible + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * If we can trace the input of the extractvalue to the point where the + * value was inserted, use that value instead. 
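+ * (Illustratively, for "%s = insertvalue {i32, float} undef, i32 %v, 0"
+ * followed by "extractvalue {i32, float} %s, 0", the extractvalue is simply
+ * replaced by %v.)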
+ * + * Because we have already split struct phi nodes, we should just be left + * with insertvalue/extractvalue pairs that we can remove here. The + * exception is when a struct is passed in to or returned from a call. + * Then we leave the extractvalue for later handling in the register + * allocator. + */ +bool GenXLowering::lowerExtractValue(ExtractValueInst *Inst) { + ArrayRef EVIndices = Inst->getIndices(); + ArrayRef Indices = EVIndices; + Value *V = Inst->getAggregateOperand(); + for (;;) { + InsertValueInst *IV = dyn_cast(V); + if (!IV) { + // If we used up any indices, create a new extractvalue for the + // remaining ones. + if (Indices.size() != EVIndices.size()) { + Instruction *NewIV = ExtractValueInst::Create( + Inst->getAggregateOperand(), Indices, Inst->getName(), Inst); + NewIV->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewIV); + ToErase.push_back(Inst); + return true; + } + return false; + } + // We have an insertvalue. See how many of the indices agree. + ArrayRef IVIndices = IV->getIndices(); + unsigned Match = 0; + while (Match < Indices.size() && Match < IVIndices.size() && + Indices[Match] == IVIndices[Match]) + ++Match; + if (!Match) { + // No match at all. Go back to the previous insertvalue. + V = IV->getAggregateOperand(); + continue; + } + // Use the inserted value here. + V = IV->getInsertedValueOperand(); + // Chop off the indices we have used up. If none left, we have finished. + Indices = Indices.slice(Match); + if (!Indices.size()) + break; + } + // We have found the struct element value V. + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerInsertValue : remove insertvalue if possible + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * In most cases, by the time we get to an insertvalue, it will be unused + * because of extractvalue removal. + * + * In a case where it is still used (probably because this function has an + * arg or return value that is a struct, or we call a function like that), + * the struct value is dealt with in register allocation. + */ +bool GenXLowering::lowerInsertValue(InsertValueInst *Inst) { + if (Inst->use_empty()) { + ToErase.push_back(Inst); + return true; + } + return false; +} + +/*********************************************************************** + * lowerUAddWithOverflow : lower llvm.uadd.with.overflow + * + * This could potentially be implemented with the vISA addc instruction. + * However an intrinsic for that would need extra GenX backend support for + * returning a struct containing two vectors, and that support does not exist + * now. + * + * So for now we use the old DEC Alpha trick of comparing the result with + * one of the operands. + */ +bool GenXLowering::lowerUAddWithOverflow(CallInst *CI) { + const DebugLoc &DL = CI->getDebugLoc(); + // Do the add. + auto Add = + BinaryOperator::Create(Instruction::Add, CI->getArgOperand(0), + CI->getArgOperand(1), CI->getName() + ".add", CI); + Add->setDebugLoc(DL); + // Do the comparison. (An unsigned add has overflowed if the result is + // smaller than one of the operands, and, if it has overflowed, the result + // is smaller than both of the operands. So it doesn't matter which operand + // we use for the comparison.) 
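+  // (E.g., illustratively in 8 bits: 200 + 100 wraps to 44, which is smaller
+  // than either operand, so an unsigned less-than compare detects the carry.)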
+ auto Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, Add, + CI->getArgOperand(1), CI->getName() + ".cmp", CI); + Cmp->setDebugLoc(DL); + // For any extractvalue use of the result of the original add with overflow, + // replace it directly. + SmallVector Extracts; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (auto EVI = dyn_cast(ui->getUser())) + Extracts.push_back(EVI); + for (auto ei = Extracts.begin(), ee = Extracts.end(); ei != ee; ++ei) { + auto EVI = *ei; + EVI->replaceAllUsesWith(EVI->getIndices()[0] ? (Value *)Cmp : (Value *)Add); + EVI->setOperand(0, UndefValue::get(CI->getType())); + ToErase.push_back(EVI); + } + // If any uses of the original intrinsic remain, recreate the struct value. + if (!CI->use_empty()) { + auto Insert = InsertValueInst::Create(UndefValue::get(CI->getType()), Add, + 0, CI->getName() + ".insertadd", CI); + Insert->setDebugLoc(DL); + Insert = InsertValueInst::Create(Insert, Cmp, 1, + CI->getName() + ".insertcmp", CI); + Insert->setDebugLoc(DL); + // ... and use it to replace the original intrinsic. + CI->replaceAllUsesWith(Insert); + } + ToErase.push_back(CI); + return true; +} + +bool GenXLowering::lowerTrap(CallInst *CI) { + Module *M = CI->getModule(); + IRBuilder<> Builder(CI); + auto &Ctx = CI->getContext(); + unsigned EMWidth = 32; + Type *ArgTypes[] = {VectorType::get(Type::getInt1Ty(Ctx), EMWidth), + VectorType::get(Type::getInt16Ty(Ctx), EMWidth)}; + auto Fn = GenXIntrinsic::getGenXDeclaration(M, + GenXIntrinsic::genx_raw_send_noresult, ArgTypes); + SmallVector Args; + // send + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0)); + // predicate all lanes + Args.push_back(ConstantVector::getSplat(EMWidth, ConstantInt::getTrue(Ctx))); + // EOT + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0x27)); + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0x02000010)); + Args.push_back(ConstantVector::getSplat(EMWidth, Constant::getNullValue(Type::getInt16Ty(Ctx)))); + Builder.CreateCall(Fn, Args); + ToErase.push_back(CI); + + return true; +} + +bool GenXLowering::lowerCtpop(CallInst *CI) { + Module *M = CI->getModule(); + IRBuilder<> Builder(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Type *Int32Ty = IntegerType::getInt32Ty(CI->getContext()); + Type *RetTy = nullptr; + if (auto *VT = dyn_cast(CI->getType())) + RetTy = VectorType::get(Int32Ty, VT->getNumElements()); + else + RetTy = Int32Ty; + + auto *CBitDecl = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_cbit, {RetTy, CI->getType()}); + Value *CBitInst = + Builder.CreateCall(CBitDecl, CI->getOperand(0), CI->getName()); + + CBitInst = Builder.CreateZExtOrTrunc(CBitInst, CI->getType()); + CI->replaceAllUsesWith(CBitInst); + ToErase.push_back(CI); + + return true; +} + +// Lower cmp instructions that GenX cannot deal with. 
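+// (Currently this expands the FCMP_ORD and FCMP_UNO predicates in terms of
+// self-comparisons, relying on the fact that x == x is false only for NaN.)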
+bool GenXLowering::lowerFCmpInst(FCmpInst *Inst) { + IRBuilder<> Builder(Inst); + Builder.SetCurrentDebugLocation(Inst->getDebugLoc()); + Value *Ops[] = {Inst->getOperand(0), Inst->getOperand(1)}; + + switch (Inst->getPredicate()) { + default: + break; + case CmpInst::FCMP_ORD: // True if ordered (no nans) + { + // %c = fcmp ord %a %b + // => + // %1 = fcmp oeq %a %a + // %2 = fcmp oeq %b %b + // %c = and %1 %2 + Value *LHS = Builder.CreateFCmpOEQ(Ops[0], Ops[0]); + Value *RHS = Builder.CreateFCmpOEQ(Ops[1], Ops[1]); + Value *New = Builder.CreateAnd(LHS, RHS); + Inst->replaceAllUsesWith(New); + ToErase.push_back(Inst); + return true; + } + case CmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y) + // %c = fcmp uno %a %b + // => + // %1 = fcmp une %a %a + // %2 = fcmp une %b %b + // %c = or %1 %2 + Value *LHS = Builder.CreateFCmpUNE(Ops[0], Ops[0]); + Value *RHS = Builder.CreateFCmpUNE(Ops[1], Ops[1]); + Value *New = Builder.CreateOr(LHS, RHS); + Inst->replaceAllUsesWith(New); + ToErase.push_back(Inst); + return true; + } + + return false; +} + +// Lower cmp instructions that GenX cannot deal with. +bool GenXLowering::lowerMul64(Instruction *Inst) { + IRBuilder<> Builder(Inst); + Builder.SetCurrentDebugLocation(Inst->getDebugLoc()); + auto Src0 = Inst->getOperand(0); + auto Src1 = Inst->getOperand(1); + auto ETy = Src0->getType(); + auto Len = 1; + if (ETy->isVectorTy()) { + Len = ETy->getVectorNumElements(); + ETy = ETy->getVectorElementType(); + } + if (!ETy->isIntegerTy() || ETy->getPrimitiveSizeInBits() != 64) + return false; + auto VTy = VectorType::get(ETy->getInt32Ty(Inst->getContext()), Len * 2); + // create src0 bitcast, then the low and high part + auto Src0V = Builder.CreateBitCast(Src0, VTy); + Region R(Inst); + R.Offset = 0; + R.Width = Len; + R.NumElements = Len; + R.Stride = 2; + R.VStride = 0; + auto Src0L = R.createRdRegion(Src0V, "", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto Src0H = R.createRdRegion(Src0V, "", Inst, Inst->getDebugLoc()); + // create src1 bitcast, then the low and high part + auto Src1V = Builder.CreateBitCast(Src1, VTy); + R.Offset = 0; + auto Src1L = R.createRdRegion(Src1V, "", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto Src1H = R.createRdRegion(Src1V, "", Inst, Inst->getDebugLoc()); + // create muls and adds + auto ResL = Builder.CreateMul(Src0L, Src1L); + // create the mulh intrinsic to the get the carry-part + Type *tys[2]; + SmallVector args; + // build type-list + tys[0] = ResL->getType(); + tys[1] = Src0L->getType(); + // build argument list + args.push_back(Src0L); + args.push_back(Src1L); + auto M = Inst->getParent()->getParent()->getParent(); + Function *IntrinFunc = + GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_umulh, tys); + Instruction *Cari = CallInst::Create(IntrinFunc, args, "", Inst); + Cari->setDebugLoc(Inst->getDebugLoc()); + auto Temp0 = Builder.CreateMul(Src0L, Src1H); + auto Temp1 = Builder.CreateAdd(Cari, Temp0); + auto Temp2 = Builder.CreateMul(Src0H, Src1L); + auto ResH = Builder.CreateAdd(Temp2, Temp1); + // create the write-regions + auto UndefV = UndefValue::get(VTy); + R.Offset = 0; + auto WrL = R.createWrRegion(UndefV, ResL, "WrLow", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto WrH = R.createWrRegion(WrL, ResH, "WrHigh", Inst, Inst->getDebugLoc()); + // create the bitcast to the destination-type + auto Replace = Builder.CreateBitCast(WrH, Inst->getType(), "mul64"); + Inst->replaceAllUsesWith(Replace); + ToErase.push_back(Inst); + return true; +} 
+/*********************************************************************** + * widenByteOp : widen a vector byte operation to short if that might + * improve code + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * Gen has restrictions on byte operands. The jitter copes with that, but + * sometimes it needs to do even-odd splitting, which can lead to suboptimal + * code if cmps and predicates are involved. + * Here we attempt to pick up the common cases by converting a byte operation + * to short. + * + * Note that we might end up with the extends being baled into the instruction + * anyway, resulting in a byte operation in vISA. + */ +bool GenXLowering::widenByteOp(Instruction *Inst) { + if (!EnableGenXByteWidening) + return false; + Type *Ty = Inst->getType(); + if (isa(Inst)) + Ty = Inst->getOperand(0)->getType(); + if (!isa(Ty) || !Ty->getScalarType()->isIntegerTy(8)) + return false; // not byte operation + if (Inst->use_empty()) + return false; // result unused + // check use, if use is a phi, stop widenning + if (!isa(Inst)) { + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + return false; + } + } + // For a predicated wrregion, widen by separating the predication into a + // rdregion and select, which can then be widened. + if (GenXIntrinsic::isWrRegion(Inst)) { + Region R(Inst, BaleInfo()); + if (R.NumElements == 1 || !R.Mask) + return false; + // Can only do this if the predicate is the right size. (We could handle + // the wrong size case by adding an rdpredregion, but then we would need + // to ensure that GenXLegalization can cope with an arbitrary size + // rdpredregion.) + if (R.Mask->getType()->getVectorNumElements() != R.NumElements) + return false; + // Create the rdregion and select. + auto NewRd = + R.createRdRegion(Inst->getOperand(0), Inst->getName() + ".byteselrdr", + Inst, Inst->getDebugLoc()); + auto NewSel = + SelectInst::Create(R.Mask, Inst->getOperand(1), NewRd, "", Inst); + NewSel->takeName(Inst); + NewSel->setDebugLoc(Inst->getDebugLoc()); + // Modify the existing wrregion. + Inst->setName(NewSel->getName() + ".byteselwrr"); + Inst->setOperand(1, NewSel); + Inst->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + Constant::getAllOnesValue(R.Mask->getType())); + // Fall through for the select to get widened. + Inst = NewSel; + } + // Do the widening for: + // 1. a compare or select + // 2. used in a zext that indicates that the user has probably already been + // widened by this code. + bool Widen = false; + if (isa(Inst) || isa(Inst)) + Widen = true; + else { + auto user = cast(Inst->use_begin()->getUser()); + if (isa(user)) + Widen = true; + } + if (!Widen) + return false; + // Widen to short. + // Decide whether to zero or sign extend. Also decide whether the result is + // guaranteed to have all 0 bits in the extended part. + Instruction::CastOps ExtOpcode = Instruction::ZExt; + bool ExtendedIsZero = false; + switch (Inst->getOpcode()) { + case Instruction::SDiv: + case Instruction::AShr: + ExtOpcode = Instruction::SExt; + break; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::LShr: + ExtendedIsZero = true; + break; + case Instruction::ICmp: + if (cast(Inst)->isSigned()) + ExtOpcode = Instruction::SExt; + break; + default: + break; + } + // Get the range of operands to process. 
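+  // (Calls widen only their argument operands; selects skip the i1 condition
+  // in operand 0.)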
+ unsigned StartIdx = 0, EndIdx = Inst->getNumOperands(); + if (auto CI = dyn_cast(Inst)) + EndIdx = CI->getNumArgOperands(); + else if (isa(Inst)) + StartIdx = 1; + // Extend the operands. + auto ExtTy = VectorType::get( + Type::getInt16Ty(Inst->getContext()), + Inst->getOperand(StartIdx)->getType()->getVectorNumElements()); + SmallVector Opnds; + for (unsigned Idx = 0; Idx != EndIdx; ++Idx) { + Value *Opnd = Inst->getOperand(Idx); + if (Idx >= StartIdx) { + if (auto C = dyn_cast(Opnd)) + Opnd = ConstantExpr::getCast(ExtOpcode, C, ExtTy); + else { + auto NewExt = CastInst::Create(ExtOpcode, Opnd, ExtTy, + Inst->getName() + ".byteext", Inst); + NewExt->setDebugLoc(Inst->getDebugLoc()); + Opnd = NewExt; + } + } + Opnds.push_back(Opnd); + } + // Create the replacement instruction. + Instruction *NewInst = nullptr; + if (isa(Inst)) + NewInst = BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(), + Opnds[0], Opnds[1], "", Inst); + else if (auto CI = dyn_cast(Inst)) + NewInst = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Opnds[0], + Opnds[1], "", CI); + else if (isa(Inst)) + NewInst = SelectInst::Create(Opnds[0], Opnds[1], Opnds[2], "", Inst); + else + llvm_unreachable("unhandled instruction in widenByteOp"); + NewInst->takeName(Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + if (ExtendedIsZero) { + // We know that the extended part of the result contains 0 bits. If we + // find that any use is a zext (probably from also being byte widened + // in this code), we can replace the use directly and save the + // trunc/zext pair. First put the uses in a vector as the use list will + // change under our feet. + SmallVector Uses; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) { + if (auto user = dyn_cast((*ui)->getUser())) { + if (user->getType() == NewInst->getType()) { + user->replaceAllUsesWith(NewInst); + ToErase.push_back(user); + // Remove the use of Inst from the trunc so we can tell whether there + // are any uses left below. + *(*ui) = UndefValue::get(Inst->getType()); + } + } + } + } + if (!Inst->use_empty()) { + // Truncate the result. + if (!isa(Inst)) { + NewInst = CastInst::Create(Instruction::Trunc, NewInst, Inst->getType(), + Inst->getName() + ".bytetrunc", Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + } + // Replace uses. + Inst->replaceAllUsesWith(NewInst); + } + ToErase.push_back(Inst); + return true; +} + +static bool breakConstantVector(unsigned i, Instruction *CurInst, + Instruction *InsertPt) { + ConstantVector *CV = cast(CurInst->getOperand(i)); + + // Splat case. + if (auto S = dyn_cast_or_null(CV->getSplatValue())) { + // Turn element into an instruction + auto Inst = S->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Type *NewTy = VectorType::get(Inst->getType(), 1); + Inst = CastInst::Create(Instruction::CastOps::BitCast, Inst, NewTy, "", + CurInst); + Inst->setDebugLoc(CurInst->getDebugLoc()); + + // Splat this value. + Region R(Inst); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = CV->getNumOperands(); + Inst = R.createRdRegion(Inst, "", InsertPt /*InsertBefore*/, + Inst->getDebugLoc()); + + // Update i-th operand with newly created splat. 
+ CurInst->setOperand(i, Inst); + return true; + } + + SmallVector Vals; + bool HasConstExpr = false; + for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) { + Value *Elt = CV->getOperand(j); + if (auto CE = dyn_cast(Elt)) { + auto Inst = CE->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Vals.push_back(Inst); + HasConstExpr = true; + } else + Vals.push_back(Elt); + } + + if (HasConstExpr) { + Value *Val = UndefValue::get(CV->getType()); + for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) { + Region R(Vals[j]); + R.Offset = j * R.ElementBytes; + Val = + R.createWrRegion(Val, Vals[j], "", InsertPt, CurInst->getDebugLoc()); + } + CurInst->setOperand(i, Val); + return true; + } + + return false; +} + +bool genx::breakConstantExprs(Function *F) { + bool Modified = false; + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + // The effect of this loop is that we process the instructions in reverse + // order, and we re-process anything inserted before the instruction + // being processed. + for (Instruction *CurInst = BB->getTerminator(); CurInst;) { + PHINode *PN = dyn_cast(CurInst); + for (unsigned i = 0, e = CurInst->getNumOperands(); i < e; ++i) { + Instruction *InsertPt = + PN ? PN->getIncomingBlock(i)->getTerminator() : CurInst; + Value *Op = CurInst->getOperand(i); + if (getUnderlyingGlobalVariable(Op) != nullptr) + continue; + if (ConstantExpr *CE = dyn_cast(Op)) { + Instruction *NewInst = CE->getAsInstruction(); + NewInst->setDebugLoc(CurInst->getDebugLoc()); + NewInst->insertBefore(CurInst); + CurInst->setOperand(i, NewInst); + Modified = true; + } else if (isa(Op)) + Modified |= breakConstantVector(i, CurInst, InsertPt); + } + CurInst = CurInst == &BB->front() ? nullptr : CurInst->getPrevNode(); + } + } + return Modified; +} + +namespace { + +// Helper class to translate load/store into proper GenX intrinsic calls. +class LoadStoreResolver { + Instruction *Inst; + const GenXSubtarget *ST; + IRBuilder<> Builder; + +public: + LoadStoreResolver(Instruction *Inst, const GenXSubtarget *ST) + : Inst(Inst), ST(ST), Builder(Inst) {} + + // Resolve this instruction and return true on success. + bool resolve(); + +private: + bool isLoad() const { return isa(Inst); } + bool isStore() const { return isa(Inst); } + + const DataLayout &getDL() const { + Function *F = Inst->getParent()->getParent(); + return F->getParent()->getDataLayout(); + } + + // Find a proper GenX intrinsic ID for this load/store instruction. + GenXIntrinsic::ID getGenXIntrinsicID() const; + + unsigned getPointerSizeInBits() const { + unsigned AddrSp = 0; + if (auto LI = dyn_cast(Inst)) + AddrSp = LI->getPointerAddressSpace(); + else if (auto SI = dyn_cast(Inst)) + AddrSp = SI->getPointerAddressSpace(); + return getDL().getPointerSizeInBits(AddrSp); + } + + unsigned getValueSizeInBits(Type *T) const { + if (auto PT = dyn_cast(T)) { + unsigned AddrSp = PT->getAddressSpace(); + return getDL().getPointerSizeInBits(AddrSp); + } + return T->getPrimitiveSizeInBits(); + } + + // Return true if this load/store can be translated. + bool isSupported() const; + + // Emit actual intrinsic calls. + bool emitGather(); + bool emitScatter(); + bool emitSVMGather(); + bool emitSVMScatter(); +}; + +} // namespace + +// Translate store instructions into genx builtins. +bool GenXLowering::lowerLoadStore(Instruction *Inst) { + auto ST = getAnalysisIfAvailable(); + LoadStoreResolver Resolver(Inst, ST ? 
ST->getSubtarget() : nullptr); + if (Resolver.resolve()) { + ToErase.push_back(Inst); + return true; + } + return false; +} + +bool LoadStoreResolver::resolve() { + if (!isSupported()) + return false; + + GenXIntrinsic::ID ID = getGenXIntrinsicID(); + switch (ID) { + case GenXIntrinsic::genx_gather_scaled: + return emitGather(); + case GenXIntrinsic::genx_scatter_scaled: + return emitScatter(); + case GenXIntrinsic::genx_svm_gather: + return emitSVMGather(); + case GenXIntrinsic::genx_svm_scatter: + return emitSVMScatter(); + default: + break; + } + + return false; +} + +// Return true if this load/store can be translated. +bool LoadStoreResolver::isSupported() const { + auto IsGlobalLoadStore = [=]() { + Value *Ptr = nullptr; + if (auto LI = dyn_cast(Inst)) + Ptr = LI->getPointerOperand(); + if (auto SI = dyn_cast(Inst)) + Ptr = SI->getPointerOperand(); + return getUnderlyingGlobalVariable(Ptr) != nullptr; + }; + + if (IsGlobalLoadStore()) + return false; + + Type *ValTy = Inst->getType(); + if (auto SI = dyn_cast(Inst)) + ValTy = SI->getValueOperand()->getType(); + + // Only scalar data types. + if (!ValTy->isFloatingPointTy() && !ValTy->isIntegerTy() && + !ValTy->isPointerTy()) { + Inst->getContext().emitError(Inst, "unsupported type for load/store"); + return false; + } + + // Only legal types: float, double, half, i8, i16, 132, i64, pointer types. + unsigned NumBits = getValueSizeInBits(ValTy); + if (NumBits < 8 || NumBits > 64 || !isPowerOf2_32(NumBits)) { + Inst->getContext().emitError("unsupported integer type for load/store"); + return false; + } + + // Translate this instruction. + return true; +} + +// Find a proper GenX intrinsic ID for this load/store instruction. +GenXIntrinsic::ID LoadStoreResolver::getGenXIntrinsicID() const { + // A32 byte scattered stateless messages only work on CNL+. + unsigned NBits = getPointerSizeInBits(); + if (NBits == 32 && ST && !ST->WaNoA32ByteScatteredStatelessMessages()) + return isLoad() ? GenXIntrinsic::genx_gather_scaled + : GenXIntrinsic::genx_scatter_scaled; + return isLoad() ? GenXIntrinsic::genx_svm_gather + : GenXIntrinsic::genx_svm_scatter; +} + +bool LoadStoreResolver::emitGather() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto LI = cast(Inst); + + // Global offset. + Value *Addr = LI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + + unsigned NBlocks = getValueSizeInBits(LI->getType()) / 8; + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + // If this is more than 4 bytes, use a larger SIMD size. + unsigned SIMD = 1; + if (NBlocks > 4) { + SIMD = NBlocks / 4; + NBlocksLog2 = 2; + } + + // The old value is undef. + Type *ValTy = LI->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Type *DataTy = VectorType::get(ValTy, 1); + if (SIMD > 1) + DataTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *OldVal = UndefValue::get(DataTy); + + // Offset. + Type *EltOffsetTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *EltOffset = Constant::getNullValue(EltOffsetTy); + if (SIMD > 1) { + SmallVector Offsets(SIMD); + for (unsigned i = 0; i < SIMD; ++i) + // Increase offset by 4 bytes for each lane. + Offsets[i] = i * 4; + EltOffset = ConstantDataVector::get(Inst->getContext(), Offsets); + } + + // Arguments. 
+ Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), SIMD)), + Builder.getInt32(NBlocksLog2), // log[2](NBlocks) + Builder.getInt16(0), // scale + Builder.getInt32(visa::getReservedSurfaceIndex( + PreDefined_Surface::PREDEFINED_SURFACE_T255)), // surface + Addr, // global offset + EltOffset, // element offset + OldVal // old value + }; + + // Overload with return type, predicate type and element offset type + Type *Tys[] = {OldVal->getType(), Args[0]->getType(), EltOffsetTy}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_gather_scaled, Tys); + + Value *NewVal = Builder.CreateCall(Fn, Args); + NewVal = Builder.CreateBitCast(NewVal, ValTy); + LI->replaceAllUsesWith(NewVal); + return true; +} + +bool LoadStoreResolver::emitScatter() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto SI = cast(Inst); + + // Global offset. + Value *Addr = SI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + + Value *Val = SI->getValueOperand(); + unsigned NBlocks = getValueSizeInBits(Val->getType()) / 8; + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + // If this is more than 4 bytes, use a larger SIMD size. + unsigned SIMD = 1; + if (NBlocks > 4) { + SIMD = NBlocks / 4; + NBlocksLog2 = 2; + } + + // Value to write. + Type *ValTy = (SIMD > 1) ? Builder.getInt32Ty() : Val->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Val = Builder.CreateBitCast(Val, VectorType::get(ValTy, SIMD)); + + // Offset. + Type *EltOffsetTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *EltOffset = Constant::getNullValue(EltOffsetTy); + if (SIMD > 1) { + SmallVector Offsets(SIMD); + // Increase offset by 4 bytes for each lane. + for (unsigned i = 0; i < SIMD; ++i) + Offsets[i] = i * 4; + EltOffset = ConstantDataVector::get(Inst->getContext(), Offsets); + } + + // Arguments. + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), SIMD)), + Builder.getInt32(NBlocksLog2), // log[2](NBlocks) + Builder.getInt16(0), // scale + Builder.getInt32(visa::getReservedSurfaceIndex( + PreDefined_Surface::PREDEFINED_SURFACE_T255)), // surface + Addr, // global offset + EltOffset, // element offset + Val // value to write + }; + + // Overload with predicate type, element offset type, value to write type. + Type *Tys[] = {Args[0]->getType(), EltOffsetTy, Val->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_scatter_scaled, Tys); + Builder.CreateCall(Fn, Args); + return true; +} + +// Compute the block size and the number of blocks for svm gather/scatter. +// +// Block_Size, 1, 4, 8 +// Num_Blocks, 1, 2, 4, +// 8 only valid for 4 byte blocks and execution size 8. +// +static unsigned getBlockCount(Type *Ty) { + unsigned NumBytes = Ty->getPrimitiveSizeInBits() / 8; + assert(NumBytes <= 8 && "out of sync"); + + // If this is N = 2 byte data, use 2 blocks; + // otherwise, use 1 block of N bytes. + return (NumBytes == 2) ? NumBytes : 1U; +} + +// Translate store to svm scatter. +bool LoadStoreResolver::emitSVMGather() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto LI = cast(Inst); + + // Address. 
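+  // svm_gather works on 64-bit addresses: a 32-bit pointer is zero-extended
+  // below and then wrapped into a one-element address vector.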
+ Value *Addr = LI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + if (NBits == 32) + Addr = Builder.CreateZExt(Addr, Builder.getInt64Ty()); + Addr = Builder.CreateBitCast(Addr, VectorType::get(Addr->getType(), 1)); + + // The old value is undef. + Type *ValTy = LI->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Type *DataTy = VectorType::get(ValTy, 1); + Value *OldVal = UndefValue::get(DataTy); + + // Num of blocks. + unsigned NBlocks = getBlockCount(OldVal->getType()); + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), 1)), + Builder.getInt32(NBlocksLog2), // log2(num_of_blocks) + Addr, // addresses + OldVal // old value + }; + + // Overload with return type, predicate type and address vector type + Type *Tys[] = {OldVal->getType(), Args[0]->getType(), Addr->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_svm_gather, Tys); + + Value *NewVal = Builder.CreateCall(Fn, Args); + NewVal = Builder.CreateBitCast(NewVal, ValTy); + if (LI->getType()->isPointerTy()) + NewVal = Builder.CreateIntToPtr(NewVal, LI->getType()); + LI->replaceAllUsesWith(NewVal); + return true; +} + +bool LoadStoreResolver::emitSVMScatter() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto SI = cast(Inst); + + // Address + Value *Addr = SI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + if (NBits == 32) + Addr = Builder.CreateZExt(Addr, Builder.getInt64Ty()); + Addr = Builder.CreateBitCast(Addr, VectorType::get(Addr->getType(), 1)); + + // data to write. + Value *Val = SI->getValueOperand(); + Type *ValTy = Val->getType(); + if (ValTy->isPointerTy()) { + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Val = Builder.CreatePtrToInt(Val, ValTy); + } + Val = Builder.CreateBitCast(Val, VectorType::get(ValTy, 1)); + + // Num of blocks. 
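+  // As in emitSVMGather, getBlockCount() encodes 2-byte data as two one-byte
+  // blocks and any other supported size as a single block.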
+ unsigned NBlocks = getBlockCount(Val->getType()); + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), 1)), + Builder.getInt32(NBlocksLog2), // log2(num_of_blocks) + Addr, // addresses + Val // value to write + }; + + // Overload with predicate type, address vector type, and data type + Type *Tys[] = {Args[0]->getType(), Addr->getType(), Val->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_svm_scatter, Tys); + + Builder.CreateCall(Fn, Args); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp new file mode 100644 index 000000000000..8cec949b1f55 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp @@ -0,0 +1,140 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXModule is a module pass whose purpose is to store information +// about the GenX module being written, such as the built kernels and functions. +// See the comment in GenXModule.h. 
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_MODULE" + +#include "GenXModule.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXSubtarget.h" +#include "GenXWATable.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" + +#include + +using namespace llvm; + +char GenXModule::ID = 0; +INITIALIZE_PASS_BEGIN(GenXModule, "GenXModule", "GenXModule", false, + true /*analysis*/) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(GenXWATable) +INITIALIZE_PASS_END(GenXModule, "GenXModule", "GenXModule", false, + true /*analysis*/) + +ModulePass *llvm::createGenXModulePass() { + initializeGenXModulePass(*PassRegistry::getPassRegistry()); + return new GenXModule; +} + +void GenXModule::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +bool GenXModule::CheckForInlineAsm(Module &M) const { + for (auto &F : M) + for (auto &BB : F) + for (auto &I : BB) { + CallInst *CI = dyn_cast(&I); + if (CI && CI->isInlineAsm()) + return true; + } + return false; +} + +/*********************************************************************** + * runOnModule : run GenXModule analysis + * + * This populates FunctionGroupAnalysis such that each FunctionGroup + * corresponds to a GenX kernel/function and its subroutines. If any + * subroutine would be used in more than one FunctionGroup, it is + * cloned. + * + * The FunctionGroup is populated in an order such that a function appears + * after all its callers. + */ +bool GenXModule::runOnModule(Module &M) { + auto FGA = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + WaTable = getAnalysis().getWATable(); + Ctx = &M.getContext(); + + InlineAsm = CheckForInlineAsm(M); + + // Iterate, processing each Function that is not yet assigned to a + // FunctionGroup. 
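+  // The loops below first record, for every function, the functions it calls
+  // (the CG map), and then ask FunctionGroupAnalysis to build a group rooted
+  // at each non-internal function definition.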
+  bool ModuleModified = false;
+
+  // build callgraph and process subgroups
+  std::map<Function *, std::vector<Function *>> CG;
+  // TODO: this is a temporary workaround for ArgIndirection problems that
+  // depend on the order of functions within a group.
+  // It should be removed once indirection is fixed.
+  std::map<Function *, std::set<Function *>> Visited;
+
+  for (auto T : FGA->TypesToProcess) {
+    for (auto &F : M) {
+      for (auto *U : F.users()) {
+        auto *Inst = dyn_cast<Instruction>(U);
+        if (!Inst) {
+          continue;
+        }
+        if (!F.empty() && Visited[Inst->getFunction()].count(&F) == 0) {
+          CG[Inst->getFunction()].push_back(&F);
+          Visited[Inst->getFunction()].insert(&F);
+        }
+        // recursive funcs must use stack
+        if (Inst->getFunction() == &F)
+          assert(F.hasFnAttribute(genx::FunctionMD::CMStackCall) &&
+                 "Found recursive function without CMStackCall attribute");
+      }
+    }
+
+    for (auto &F : M) {
+      if (F.empty() || F.getLinkage() == GlobalValue::InternalLinkage)
+        continue;
+      ModuleModified |= FGA->buildGroup(CG, &F, nullptr, T);
+    }
+
+    FGA->clearVisited();
+    CG.clear();
+    Visited.clear();
+  }
+
+  return ModuleModified;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h
new file mode 100644
index 000000000000..bf95364457e4
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h
@@ -0,0 +1,185 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXModule
+/// ----------
+///
+/// GenXModule is a module pass whose purpose is to store information
+/// about the module being written, such as the built kernels and functions.
+///
+/// A vISA kernel or function can call a *subroutine*, which can
+/// then call further subroutines. All called subroutines are considered part of
+/// the kernel or function, which means that a subroutine used by two different
+/// kernels needs to have a copy in each. The two copies may be treated differently
+/// by the backend passes, so there do actually need to be two copies of the
+/// subroutine in the LLVM IR in the backend, one called by each kernel.
+///
+/// The GenXModule pass performs any necessary copying of subroutines, and
+/// populates FunctionGroupAnalysis such that each kernel and its subroutines
+/// make one FunctionGroup.
+///
+/// Subsequent passes are mostly FunctionGroupPasses, so they process one
+/// FunctionGroup at a time.
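+///
+/// For example, if subroutine S is called from kernels K1 and K2, this pass
+/// leaves two copies of S in the IR and FunctionGroupAnalysis ends up with
+/// two groups, {K1, S} and {K2, S-copy}.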
+/// +/// GenXModule is also an analysis, preserved through subsequent passes to +/// GenXVisaWriter at the end, that is used to store each written vISA kernel. +/// +/// **IR restriction**: After this pass, the lead function in a FunctionGroup is +/// a kernel (or function in the vISA sense), and other functions in the same +/// FunctionGroup are its subroutines. A (non-intrinsic) call must be to a +/// function in the same FunctionGroup, and not the lead function. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXMODULE_H +#define GENXMODULE_H + +#include "GenX.h" +#include "GenXBaling.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" + +#include + +#include +#include +#include + +class VISABuilder; +class VISAKernel; + +namespace llvm { + class raw_pwrite_stream; + class GenXSubtarget; + + namespace genx { + + // Stream : a class for streaming byte data, and then writing out to a + // formatted_output_stream. + class Stream { + std::vector V; + public: + void push_back(const void *Data, unsigned Size) { + unsigned Pos = V.size(); + V.resize(Pos + Size); + std::copy_n((const unsigned char *)Data, Size, V.begin() + Pos); + } + template void push_back(T Val) { push_back(&Val, sizeof(Val)); } + unsigned size() { return V.size(); } + void write(raw_pwrite_stream &Out); + void setData(unsigned Offset, const void *Data, unsigned Size) { + assert(Offset + Size <= size()); + std::copy_n((const unsigned char *)Data, Size, V.begin() + Offset); + } + }; + + // FuncWriter : a class to write the output for a GenX kernel or function + class FuncWriter { + public: + FuncWriter() {} + virtual ~FuncWriter() {} + // isKernel : true if the Func is a kernel + virtual bool isKernel() = 0; + // setOffset : set the offset field in the header + // For a kernel, it also sets the input_offset field in the header + virtual void setOffset(uint32_t O) = 0; + // get header/body size + virtual unsigned getHeaderSize() = 0; + virtual unsigned getBodySize() = 0; + // write header/body + virtual void writeHeader(raw_pwrite_stream &Out) = 0; + virtual void writeBody(raw_pwrite_stream &Out) = 0; + }; + + } // end namespace genx + + + //-------------------------------------------------------------------- + // GenXModule pass. 
Stores the information from various parts of the + // GenX writing process + class GenXModule : public ModulePass { + typedef std::vector FuncWriters_t; + FuncWriters_t FuncWriters; + const GenXSubtarget *ST; + LLVMContext *Ctx = nullptr; + WA_TABLE *WaTable = nullptr; + + void collectFinalizerArgs(std::vector &Owner) const; + void clearFinalizerArgs(std::vector& Owner) const; + + VISABuilder *CisaBuilder = nullptr; + std::vector CISA_Args; + void InitCISABuilder(); + + VISABuilder *VISAAsmTextReader = nullptr; + std::vector VISA_Args; + void InitVISAAsmReader(); + + bool InlineAsm = false; + bool CheckForInlineAsm(Module &M) const; + + std::map VisaKernelMap; + + public: + static char ID; + explicit GenXModule() : ModulePass(ID) {} + ~GenXModule() { + clearFinalizerArgs(VISA_Args); + clearFinalizerArgs(CISA_Args); + for (unsigned i = 0; i != FuncWriters.size(); i++) + delete FuncWriters[i]; + } + virtual StringRef getPassName() const { return "GenX module"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnModule(Module &M); + const GenXSubtarget *getSubtarget() { return ST; } + // iterator for FuncWriters list + typedef FuncWriters_t::iterator iterator; + iterator begin() { return FuncWriters.begin(); } + iterator end() { return FuncWriters.end(); } + void push_back(genx::FuncWriter *VF) { FuncWriters.push_back(VF); } + bool HasInlineAsm() const { return InlineAsm; } + VISABuilder *GetCisaBuilder(); + VISABuilder *GetVISAAsmReader(); + void DestroyCISABuilder(); + void DestroyVISAAsmReader(); + LLVMContext &getContext(); + + // Save and retrieve VISAKernels for given function. + void saveVisaKernel(const Function *F, VISAKernel *Kernel) { + assert(VisaKernelMap.count(F) == 0 && "Attempt to save kernel twice"); + VisaKernelMap[F] = Kernel; + } + // Valid only on GenXFinalizer stage until visa builder destructors called. + VISAKernel *getVISAKernel(const Function *F) const { + return VisaKernelMap.at(F); + } + }; + + void initializeGenXModulePass(PassRegistry &); + +} // end namespace llvm +#endif // ndef GENXMODULE_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp new file mode 100644 index 000000000000..f9368a06d411 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp @@ -0,0 +1,392 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// GenXNumbering is an analysis that provides a numbering of the instructions +// for use by live range segments. See GenXNumbering.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_NUMBERING" + +#include "GenXNumbering.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +char GenXNumbering::ID = 0; +INITIALIZE_PASS_BEGIN(GenXNumbering, "GenXNumbering", "GenXNumbering", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_END(GenXNumbering, "GenXNumbering", "GenXNumbering", false, false) + +FunctionGroupPass *llvm::createGenXNumberingPass() +{ + initializeGenXNumberingPass(*PassRegistry::getPassRegistry()); + return new GenXNumbering(); +} + +void GenXNumbering::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : run pass + */ +bool GenXNumbering::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + clear(); + FG = &ArgFG; + Baling = &getAnalysis(); + unsigned Num = 0; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) + Num = numberInstructionsInFunc(*fgi, Num); + LastNum = Num; + return false; +} + +/*********************************************************************** + * clear : clear the GenXNumbering + */ +void GenXNumbering::clear() +{ + BBNumbers.clear(); + Numbers.clear(); + NumberToPhiIncomingMap.clear(); +} + +/*********************************************************************** + * numberInstructionsInFunc : number the instructions in a function + */ +unsigned GenXNumbering::numberInstructionsInFunc(Function *Func, unsigned Num) +{ + // Number the function, reserving one number for the args. + Numbers[Func] = Num++; + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *Block = &*fi; + // Number the basic block. + auto BBNumber = &BBNumbers[Block]; + BBNumber->Index = BBNumbers.size() - 1; + Numbers[Block] = Num++; + // If this is the first block of a kernel, reserve kernel arg copy slots. + if (Block == &Func->front() && isKernel(Func)) + for (auto ai = Func->arg_begin(), ae = Func->arg_end(); ai != ae; ++ai) + ++Num; + // Iterate the instructions. + Instruction *Inst; + for (BasicBlock::iterator bi = Block->begin(); ; ++bi) { + Inst = &*bi; + if (Inst->isTerminator()) + break; + // For most instructions, reserve one number for any pre-copy that + // coalescing needs to insert, and nothing after. + unsigned PreReserve = 1, PostReserve = 0; + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI) && + !CI->isInlineAsm()) { + // For a non-intrinsic call, reserve enough numbers before the call + // for: + // - a slot for each element of the args, two numbers per element: + // 1. one for the address setup in case it is an address arg added + // by arg indirection (as returned by getArgIndirectionNumber()); + // 2. 
one for a pre-copy inserted if coalescing fails (as returned + // by getArgPreCopyNumber()); + // + // - a similar slot with two numbers for any address arg added by + // arg indirection (also as returned by getArgIndirectionNumber() + // and getArgPreCopyNumber()). + // + // Reserve enough numbers after the call for: + // - post-copies of (elements of) the return value, as returned by + // getRetPostCopyNumber(). + // + // Note that numbers get wasted because most call args do not need + // two slots, and most calls never have address args added by arg + // indirection. But treating all call args the same is easier, and + // wasting numbers does not really matter. + PreReserve = 2 * IndexFlattener::getNumArgElements( + CI->getFunctionType()); + PreReserve += 2 * CI->getNumArgOperands(); // extra for pre-copy addresses of args + unsigned NumRetVals = IndexFlattener::getNumElements(CI->getType()); + PreReserve += NumRetVals; // extra for pre-copy addresses of retvals + PostReserve = NumRetVals; + // Set the start number of the call so users of numbering can work out + // where the pre-copies are assumed to start, even if the call gets + // modified later by GenXArgIndirection. + setStartNumber(CI, Num); + } + } + // Number the instruction, reserving PreReserve. + Num += PreReserve; + Numbers[Inst] = Num; + Num += 1 + PostReserve; + } + // We have reached the terminator instruction but not yet numbered it. + // Reserve a number for each phi node in the successor. If there is + // more than one successor (this is a critical edge), then allow for + // whichever successor has the most phi nodes. + BBNumber->PhiNumber = Num; + auto TI = cast(Block->getTerminator()); + unsigned MaxPhis = 0; + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + unsigned NumPhis = 0; + for (BasicBlock::iterator sbi = Succ->begin(), sbe = Succ->end(); sbi != sbe; ++sbi) { + if (!isa(&*sbi)) + break; + NumPhis++; + } + if (NumPhis > MaxPhis) + MaxPhis = NumPhis; + } + Num += MaxPhis; + // Now number the terminator instruction. Doing it here ensures that any + // input to the terminator instruction interferes with the results of the + // phi nodes of the successor. + unsigned PreReserve = 1; + if (isa(Inst)) { + // For a return, reserve enough numbers before for pre-copies of + // (elements of) the return value. 
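+      // getRetPreCopyNumber() recovers these slots from the return's own
+      // number, so this reservation and that calculation must stay in sync.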
+ PreReserve = IndexFlattener::getNumElements(Func->getReturnType()); + } + Num += PreReserve; + Numbers[Inst] = Num++; + BBNumber->EndNumber = Num; + } + return Num; +} + +/*********************************************************************** + * getBaleNumber : get instruction number for head of bale, 0 if none + */ +unsigned GenXNumbering::getBaleNumber(Instruction *Inst) +{ + Inst = Baling->getBaleHead(Inst); + return getNumber(Inst); +} + +/*********************************************************************** + * getNumber : get instruction number, or 0 if none + */ +unsigned GenXNumbering::getNumber(Value *V) +{ + auto i = Numbers.find(V), e = Numbers.end(); + if (i == e) + return 0; + return i->second; +} + +/*********************************************************************** + * setNumber : get instruction number + */ +void GenXNumbering::setNumber(Value *V, unsigned Number) +{ + Numbers[V] = Number; +} + +/*********************************************************************** + * getArgIndirectionNumber : get number of arg indirection slot for call arg + * + * Enter: CI = CallInst + * OperandNum = operand (arg) number + * Index = flattened index in the struct + * + * Each flattened index in each call arg has an arg indirection slot before the + * call instruction, where a copy will be inserted if coalescing fails. Each + * slot in fact has two numbers, and this returns the first one. (The second + * one is used for arg pre-copy when coalescing fails.) + */ +unsigned GenXNumbering::getArgIndirectionNumber(CallInst *CI, unsigned OperandNum, + unsigned Index) +{ + auto FT = cast(CI->getFunctionType()); + return getStartNumber(CI) + 2 * (IndexFlattener::flattenArg(FT, OperandNum) + + Index); +} + +/*********************************************************************** + * getKernelArgCopyNumber : get number of kernel arg copy slot + */ +unsigned GenXNumbering::getKernelArgCopyNumber(Argument *Arg) +{ + assert(isKernel(Arg->getParent())); + return Numbers[&Arg->getParent()->front()] + 1 + Arg->getArgNo(); +} + +/*********************************************************************** + * getArgPreCopyNumber : get number of pre-copy slot for call arg + * + * Enter: CI = CallInst + * OperandNum = operand (arg) number + * Index = flattened index in the struct + * + * Each flattened index in each call arg has an arg pre-copy slot before the + * call instruction, where a copy will be inserted if coalescing fails. Each + * slot in fact has two numbers, and this returns the second one. (The first + * one is used for address loading in arg indirection.) + */ +unsigned GenXNumbering::getArgPreCopyNumber(CallInst *CI, unsigned OperandNum, + unsigned Index) +{ + return getArgIndirectionNumber(CI, OperandNum, Index) + 1; +} + +/*********************************************************************** + * getRetPreCopyNumber : get number of pre-copy slot for return value + * + * Enter: RI = ReturnInst + * Index = flattened index in the struct + * + * For each flattened index in the return type, there is one slot before the + * return instruction. 
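+ *
+ * For example, with a return type that flattens to two elements, Index 0 and
+ * Index 1 map to the two numbers immediately before the return's own number.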
+ */ +unsigned GenXNumbering::getRetPreCopyNumber(ReturnInst *RI, unsigned Index) +{ + return getNumber(RI) + - IndexFlattener::getNumElements(RI->getOperand(0)->getType()) + Index; +} + +/*********************************************************************** + * getRetPostCopyNumber : get number of post-copy slot for return value + * + * Enter: CI = CallInst + * Index = flattened index in the struct + * + * For each flattened index in the return type, there is one slot after the call + * instruction. + */ +unsigned GenXNumbering::getRetPostCopyNumber(CallInst *CI, unsigned Index) +{ + return getNumber(CI) + 1 + Index; +} + +/*********************************************************************** + * getPhiNumber : get instruction number for phi node for particular predecessor + * + * The non-const version caches the result in NumberToPhiIncomingMap, for the + * later use of getPhiIncomingFromNumber. + */ +unsigned GenXNumbering::getPhiNumber(PHINode *Phi, BasicBlock *BB) const +{ + // The instruction number is the count of phi nodes before it added to the + // PhiNumber for the predecessor. + return BBNumbers.find(BB)->second.PhiNumber + getPhiOffset(Phi); +} + +unsigned GenXNumbering::getPhiNumber(PHINode *Phi, BasicBlock *BB) +{ + unsigned Number = ((const GenXNumbering *)this)->getPhiNumber(Phi, BB); + NumberToPhiIncomingMap[Number] + = std::pair(Phi, Phi->getBasicBlockIndex(BB)); + return Number; +} + +/*********************************************************************** + * getPhiIncomingFromNumber : get the phi incoming for a number returned from + * getPhiNumber + * + * This returns the phi node and incoming index corresponding to the supplied + * instruction number. + */ +std::pair GenXNumbering::getPhiIncomingFromNumber( + unsigned Number) +{ + auto i = NumberToPhiIncomingMap.find(Number); + if (i == NumberToPhiIncomingMap.end()) + return std::pair(nullptr, 0); + return i->second; +} + +/*********************************************************************** + * getPhiOffset : get phi node offset (the 0 based index within its block) + */ +unsigned GenXNumbering::getPhiOffset(PHINode *Phi) const +{ + // Count phi nodes from start of basic block to here. 
+ unsigned Count = 0; + for (BasicBlock::const_iterator bi = Phi->getParent()->begin(); &*bi != Phi; ++bi) + ++Count; + return Count; +} + +/*********************************************************************** + * dump, print : dump the instruction numbering + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXNumbering::dump() +{ + print(errs()); errs() << '\n'; +} +#endif + +void GenXNumbering::print(raw_ostream &OS) const +{ + OS << "GenXNumbering for FunctionGroup " << FG->getName() << "\n"; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *Func = *fgi; + if (FG->size() != 1) + OS << Func->getName() << ":\n"; + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + OS << "\n" << Numbers.find(BB)->second << " " << BB->getName() << ":\n"; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (Numbers.find(Inst) == Numbers.end()) + OS << " - "; + else + OS << Numbers.find(Inst)->second; + OS << " "; + Inst->print(OS); + OS << "\n"; + } + auto TI = cast(BB->getTerminator()); + if (TI->getNumSuccessors()) { + BasicBlock *Succ = TI->getSuccessor(0); + for (BasicBlock::iterator sbi = Succ->begin(), sbe = Succ->end(); sbi != sbe; ++sbi) { + if (PHINode *Phi = dyn_cast(&*sbi)) { + OS << "(" << getPhiNumber(Phi, BB) << ") "; + Phi->print(OS); + OS << "\n"; + } else + break; + } + } + } + } + OS << "\n"; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h new file mode 100644 index 000000000000..dd92bbaaf24e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h @@ -0,0 +1,166 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXNumbering +/// ------------- +/// +/// GenXNumbering is an analysis that provides a numbering of the instructions +/// for use by live ranges. +/// +/// The numbering is done such that slots are reserved for where GenXCoalescing +/// might need to insert copies. +/// +/// Generally, an instruction gets a slot in the numbering for itself, and +/// another slot just before, in case it is a two address instruction where +/// GenXCoalescing might want to insert a copy. +/// +/// Every instruction gets a number, even if it is baled in. 
However, for the +/// purposes of live range segments, every instruction in a bale is assumed +/// to have the same number as the head instruction of the bale. +/// +/// A non-intrinsic call has N slots reserved +/// before it for pre-copies, where N is the number of SimpleValues in the +/// (possibly struct) args, allowing for extra args that might be added later by +/// GenXArgIndirection. +/// +/// Similarly, a non-intrinsic call has N slots reserved after it for +/// post-copies, where N is the number of SimpleValues in the (possibly struct) +/// return value. The definition of each SimpleValue in the result of the call +/// is considered to be in its slot, and the corresponding SimpleValue in the +/// unified return value has an extra segment of live range from the call up to +/// that slot. +/// +/// A return instruction in a subroutine has N slots reserved before it for +/// pre-copies, where N is the number of SimpleValues in the (possibly struct) +/// return value. The use of each SimpleValue in the return is considered to be +/// in its slot, and the corresponding SimpleValue in the unified return value +/// has an extra segment of live range from the slot up to the return. +/// +/// A kernel has a slot for each kernel arg copy. A copy is inserted into such a slot in +/// GenXCoalescing if the kernel arg offset is not aligned enough for the uses +/// of the value. +/// +/// **IR restriction**: After this pass, it is very difficult to modify code +/// other than by inserting copies in the reserved slots above, as it would +/// disturb the numbering. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXNUMBERING_H +#define GENXNUMBERING_H + +#include "FunctionGroup.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/IR/Value.h" + +namespace llvm { + +class CallInst; +class GenXBaling; +class PHINode; +class ReturnInst; + +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); + +class GenXNumbering : public FunctionGroupPass { + FunctionGroup *FG; + GenXBaling *Baling; + struct BBNumber { + unsigned Index; // 0-based index in list of basic blocks + unsigned PhiNumber; // instruction number of first phi node in successor + unsigned EndNumber; // instruction number of end of block + }; + // BBNumbers : The 0-based number (index) of each basic block. + ValueMap> BBNumbers; + // Numbers : The map of instruction numbers. + ValueMap> Numbers; + // StartNumbers : for a CallInst, the start number of where arg pre-copies + // are considered to be. This is stored, instead of being calculated from + // the CallInst's number, so that a CallInst can change number of args, as + // happens in GenXArgIndirection. + ValueMap> StartNumbers; + // NumberToPhiIncomingMap : map from instruction number to the phi incoming (phi + // node plus incoming index) it represents. We assume that a phi node is + // never deleted after GenXNumbering. + std::map> NumberToPhiIncomingMap; + + // The number for the entire fucntion group. All live ranges are included in + // live-range [0, LastNum]. 
+ unsigned LastNum = 0; + +public: + static char ID; + explicit GenXNumbering() : FunctionGroupPass(ID), Baling(0) { } + ~GenXNumbering() { clear(); } + virtual StringRef getPassName() const { return "GenX numbering"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // get BBNumber struct for a basic block + const BBNumber *getBBNumber(BasicBlock *BB) { return &BBNumbers[BB]; } + // get and set instruction number + unsigned getBaleNumber(Instruction *Inst); + unsigned getNumber(Value *V); + unsigned getLastNumber() const { return LastNum; } + void setNumber(Value *V, unsigned Number); + // get and set "start instruction number" for a CallInst + unsigned getStartNumber(Value *V) { return StartNumbers[V]; } + void setStartNumber(Value *V, unsigned Number) { StartNumbers[V] = Number; } + // get number for kernel arg copy, arg pre-copy, ret pre-copy and ret post-copy sites + unsigned getArgIndirectionNumber(CallInst *CI, unsigned OperandNum, unsigned Index); + unsigned getKernelArgCopyNumber(Argument *Arg); + unsigned getArgPreCopyNumber(CallInst *CI, unsigned OperandNum, unsigned Index); + unsigned getRetPreCopyNumber(ReturnInst *RI, unsigned Index); + unsigned getRetPostCopyNumber(CallInst *CI, unsigned Index); + // get the number of a phi incoming, where its copy will be inserted + // if necessary + unsigned getPhiNumber(PHINode *Phi, BasicBlock *BB) const; + unsigned getPhiNumber(PHINode *Phi, BasicBlock *BB); + // getPhiIncomingFromNumber : get the phi incoming for a number returned from getPhiNumber + std::pair getPhiIncomingFromNumber(unsigned Number); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXGroupPrinterPass(O, Banner); + } + // Debug dump + void dump(); + using llvm::Pass::print; // enables overloading of print in this class rather + // than override (and stops compiler warnings) + virtual void print(raw_ostream &OS) const; + +private: + void clear(); + unsigned numberInstructionsInFunc(Function *Func, unsigned Num); + unsigned getPhiOffset(PHINode *Phi) const; +}; + +void initializeGenXNumberingPass(PassRegistry &); + +} // end namespace llvm +#endif //ndef GENXNUMBERING_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp new file mode 100644 index 000000000000..446409cf99ee --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp @@ -0,0 +1,77 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "GenX.h" +#include "GenXOCLRuntimeInfo.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace llvm { +void initializeGenXOCLInfoExtractorPass(PassRegistry &PR); +} + +class GenXOCLInfoExtractor : public ModulePass { +public: + static char ID; + +private: + std::vector *Dest = nullptr; + +public: + StringRef getPassName() const override { return "GenX OCL Info Extractor"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + GenXOCLInfoExtractor() : ModulePass(ID) {} + + GenXOCLInfoExtractor(std::vector &Dst) + : ModulePass(ID), Dest(&Dst) { + initializeGenXOCLInfoExtractorPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + assert(Dest && "Expected dest to be initialized"); + auto &Info = getAnalysis(); + *Dest = Info.stealCompiledKernels(); + return false; + } +}; + +char GenXOCLInfoExtractor::ID = 0; + +INITIALIZE_PASS_BEGIN(GenXOCLInfoExtractor, "GenXOCLInfoExtractor", + "GenXOCLInfoExtractor", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXOCLRuntimeInfo) +INITIALIZE_PASS_END(GenXOCLInfoExtractor, "GenXOCLInfoExtractor", + "GenXOCLInfoExtractor", false, false) + +ModulePass *llvm::createGenXOCLInfoExtractorPass( + std::vector &Dest) { + return new GenXOCLInfoExtractor(Dest); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp new file mode 100644 index 000000000000..a0f9990d1461 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp @@ -0,0 +1,292 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXOCLRuntimeInfo.h" +#include "GenX.h" +#include "GenXSubtarget.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/DataLayout.h" + +#include +#include +#include + +using namespace llvm; + +char GenXOCLRuntimeInfo::ID = 0; + +//===----------------------------------------------------------------------===// +// +// Kernel info implementation. +// +//===----------------------------------------------------------------------===// +// Just perform linear instructions scan to find usage stats. +// Intrinsic set copied from igcmc. +void GenXOCLRuntimeInfo::KernelInfo::setInstructionUsageProperties( + FunctionGroup &FG, const GenXSubtarget &ST) { + for (Function *F : FG) { + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + switch (GenXIntrinsic::getGenXIntrinsicID(&I)) { + default: + break; + case GenXIntrinsic::genx_group_id_x: + case GenXIntrinsic::genx_group_id_y: + case GenXIntrinsic::genx_group_id_z: + UsesGroupId = true; + break; + case GenXIntrinsic::genx_barrier: + UsesBarriers = true; + break; + case GenXIntrinsic::genx_ssdp4a: + case GenXIntrinsic::genx_sudp4a: + case GenXIntrinsic::genx_usdp4a: + case GenXIntrinsic::genx_uudp4a: + case GenXIntrinsic::genx_ssdp4a_sat: + case GenXIntrinsic::genx_sudp4a_sat: + case GenXIntrinsic::genx_usdp4a_sat: + case GenXIntrinsic::genx_uudp4a_sat: + break; + case GenXIntrinsic::genx_alloca: + ThreadPrivateMemSize = ST.stackSurfaceMaxSize(); + break; + } + } + } + } +} + +void GenXOCLRuntimeInfo::KernelInfo::setMetadataProperties( + genx::KernelMetadata &KM, const GenXSubtarget &ST) { + Name = KM.getName(); + SLMSize = KM.getSLMSize(); + // will be replaced to metadata usage once + // useGlobalMem option is removed from GenXSubtarget + // FIXME: replace with 8k * simdSize * numDispatchedThreads + if (ST.useGlobalMem()) + StatelessPrivateMemSize = 16 * 8192; + +} + +void GenXOCLRuntimeInfo::KernelInfo::setArgumentProperties( + const Function &Kernel, genx::KernelMetadata &KM) { + assert(Kernel.arg_size() == KM.getNumArgs() && + "Expected same number of arguments"); + // Some arguments are part of thread payload and do not require + // entries in arguments info for OCL runtime. 
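+  // Local IDs and group/local sizes are delivered in the thread payload, so
+  // they are filtered out here and only the remaining arguments are reported.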
+ auto NonPayloadArgs = + make_filter_range(Kernel.args(), [&KM](const Argument &Arg) { + uint32_t ArgKind = KM.getArgKind(Arg.getArgNo()); + genx::KernelArgInfo KAI(ArgKind); + return !(KAI.isLocalIDX() || KAI.isLocalIDY() || KAI.isLocalIDZ() || + KAI.isGroupOrLocalSize() || KAI.isLocalIDs()); + }); + const DataLayout &DL = Kernel.getParent()->getDataLayout(); + transform(NonPayloadArgs, std::back_inserter(ArgInfos), + [&KM, &DL](const Argument &Arg) { + return KernelArgInfo{Arg, KM, DL}; + }); + UsesReadWriteImages = std::any_of( + ArgInfos.begin(), ArgInfos.end(), [](const KernelArgInfo &AI) { + return AI.isImage() && + AI.getAccessKind() == KernelArgInfo::AccessKindType::ReadWrite; + }); +} + +void GenXOCLRuntimeInfo::KernelInfo::setPrintStrings( + const Module &KernelModule) { + const auto *StringsMeta = KernelModule.getNamedMetadata("cm_print_strings"); + if (!StringsMeta) + return; + std::transform(StringsMeta->op_begin(), StringsMeta->op_end(), + std::back_inserter(PrintStrings), [](const auto *StringMeta) { + StringRef Str = + cast(StringMeta->getOperand(0))->getString(); + return std::string{Str.begin(), Str.end()}; + }); +} + +GenXOCLRuntimeInfo::KernelInfo::KernelInfo(FunctionGroup &FG, + const GenXSubtarget &ST) { + setInstructionUsageProperties(FG, ST); + + GRFSizeInBytes = ST.getGRFWidth(); + + genx::KernelMetadata KM{FG.getHead()}; + assert(KM.isKernel() && "Expected kernel as head of function group"); + setMetadataProperties(KM, ST); + setArgumentProperties(*FG.getHead(), KM); + setPrintStrings(*FG.getHead()->getParent()); +} + +//===----------------------------------------------------------------------===// +// +// Kernel argument info implementation. +// +//===----------------------------------------------------------------------===// +// Supported kernel argument attributes. +// Copied from igcmc.h. +struct OCLAttributes { + static constexpr auto ReadOnly = + "read_only"; // This resource is for read only. + static constexpr auto WriteOnly = + "write_only"; // This resource is for write only. + static constexpr auto ReadWrite = + "read_write"; // This resource is for read and write. + static constexpr auto Buffer = "buffer_t"; // This resource is a buffer. + static constexpr auto SVM = "svmptr_t"; // This resource is a SVM buffer. + static constexpr auto Sampler = "sampler_t"; // This resource is a sampler. + static constexpr auto Image1d = "image1d_t"; // This resource is a 1D surface. + static constexpr auto Image1d_buffer = "image1d_buffer_t"; // This resource is a 1D surface. + static constexpr auto Image2d = "image2d_t"; // This resource is a 2D surface. + static constexpr auto Image3d = "image3d_t"; // This resource is a 3D surface. +}; + +using ArgKindType = GenXOCLRuntimeInfo::KernelArgInfo::KindType; + +static auto GetStrPred = [](const char *Attr) { + return [Attr](StringRef Token) { return Token == Attr; }; +}; + +static ArgKindType getOCLArgKind(const SmallVectorImpl &Tokens, + unsigned ArgNo, genx::KernelMetadata &KM) { + unsigned RawKind = KM.getArgKind(ArgNo); + + // Implicit arguments. + genx::KernelArgInfo KAI{RawKind}; + if (KAI.isLocalSize()) + return ArgKindType::LocalSize; + if (KAI.isGroupCount()) + return ArgKindType::GroupCount; + if (KAI.isPrintBuffer()) + return ArgKindType::PrintBuffer; + if (KAI.isPrivateBase()) + return ArgKindType::PrivateBase; + + // Explicit arguments. 
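+  // The explicit kind is derived from the argument's register category plus
+  // its textual attributes: general values may be SVM pointers, surfaces map
+  // to images or buffers depending on the image*_t attribute, and samplers
+  // map directly.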
+ switch (KM.getArgCategory(ArgNo)) { + default: + return ArgKindType::General; + case genx::RegCategory::GENERAL: + if (any_of(Tokens, GetStrPred(OCLAttributes::SVM))) + return ArgKindType::SVM; + return ArgKindType::General; + case genx::RegCategory::SURFACE: + if (any_of(Tokens, GetStrPred(OCLAttributes::Image1d))) + return ArgKindType::Image1D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image1d_buffer))) + return ArgKindType::Image1D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image2d))) + return ArgKindType::Image2D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image3d))) + return ArgKindType::Image3D; + return ArgKindType::Buffer; + case genx::RegCategory::SAMPLER: + return ArgKindType::Sampler; + } +} + +using ArgAccessKindType = GenXOCLRuntimeInfo::KernelArgInfo::AccessKindType; + +static ArgAccessKindType +getOCLArgAccessKind(const SmallVectorImpl &Tokens, + ArgKindType Kind) { + // As in igcmc.cpp. + switch (Kind) { + case ArgKindType::Buffer: + case ArgKindType::Image1D: + case ArgKindType::Image2D: + case ArgKindType::Image3D: + case ArgKindType::SVM: + if (any_of(Tokens, GetStrPred(OCLAttributes::ReadOnly))) + return ArgAccessKindType::ReadOnly; + if (any_of(Tokens, GetStrPred(OCLAttributes::WriteOnly))) + return ArgAccessKindType::WriteOnly; + return ArgAccessKindType::ReadWrite; + default: + return ArgAccessKindType::None; + } +} + +// Initialize Kind and AccessKind from given ArgTypeDesc in metadata. +void GenXOCLRuntimeInfo::KernelArgInfo::translateArgDesc( + genx::KernelMetadata &KM) { + std::string Translated{KM.getArgTypeDesc(Index)}; + // Transform each separator to space. + std::transform(Translated.begin(), Translated.end(), Translated.begin(), + [](char C) { + if (C != '-' && C != '_' && C != '=' && !std::isalnum(C)) + return ' '; + return C; + }); + + // Split and delete duplicates. + SmallVector Tokens; + StringRef(Translated) + .split(Tokens, ' ', -1 /* MaxSplit */, false /* AllowEmpty */); + std::sort(Tokens.begin(), Tokens.end()); + Tokens.erase(std::unique(Tokens.begin(), Tokens.end()), Tokens.end()); + + Kind = getOCLArgKind(Tokens, Index, KM); + AccessKind = getOCLArgAccessKind(Tokens, Kind); +} + +static unsigned getArgSizeInBytes(const Argument &Arg, genx::KernelMetadata &KM, + const DataLayout &DL) { + Type *ArgTy = Arg.getType(); + if (ArgTy->isPointerTy()) + return DL.getPointerTypeSize(ArgTy); + if (KM.isBufferType(Arg.getArgNo())) + return DL.getPointerSize(); + return ArgTy->getPrimitiveSizeInBits() / genx::ByteBits; +} + +GenXOCLRuntimeInfo::KernelArgInfo::KernelArgInfo(const Argument &Arg, + genx::KernelMetadata &KM, + const DataLayout &DL) + : Index(Arg.getArgNo()) { + translateArgDesc(KM); + Offset = KM.getArgOffset(Index); + SizeInBytes = getArgSizeInBytes(Arg, KM, DL); + BTI = KM.getBTI(Index); +} + +//===----------------------------------------------------------------------===// +// +// Compiled kernel implementation. 
+// +//===----------------------------------------------------------------------===// +GenXOCLRuntimeInfo::CompiledKernel::CompiledKernel(KernelInfo &&KI, + const FINALIZER_INFO &JI, + ArrayRef GenBin) + : CompilerInfo(std::move(KI)), JitterInfo(JI), + GenBinary(GenBin.begin(), GenBin.end()) {} + +INITIALIZE_PASS_BEGIN(GenXOCLRuntimeInfo, "GenXOCLRuntimeInfo", + "GenXOCLRuntimeInfo", false, true) +INITIALIZE_PASS_END(GenXOCLRuntimeInfo, "GenXOCLRuntimeInfo", + "GenXOCLRuntimeInfo", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h new file mode 100644 index 000000000000..438a6e3fea02 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h @@ -0,0 +1,256 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VCOPT_LIB_GENXCODEGEN_GENXOCLRUNTIMEINFO_H +#define VCOPT_LIB_GENXCODEGEN_GENXOCLRUNTIMEINFO_H + +#include "FunctionGroup.h" +#include "JitterDataStruct.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Pass.h" +#include +#include + +namespace llvm { +class Function; +class GenXSubtarget; + +void initializeGenXOCLRuntimeInfoPass(PassRegistry &PR); + +// This is an immutable pass to allow it creation once in the beginning of +// pipeline since creating it before actual place of need (cisa builder) +// will invalidate every other analyses required by builder. 
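+// The pass itself only stores CompiledKernel descriptors: the CISA builder
+// fills them in through saveCompiledKernel(), and GenXOCLInfoExtractor later
+// moves them out with stealCompiledKernels().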
+class GenXOCLRuntimeInfo : public ImmutablePass { +public: + class KernelArgInfo { + public: + enum class KindType { + General, + LocalSize, + GroupCount, + Buffer, + SVM, + Sampler, + Image1D, + Image2D, + Image3D, + PrintBuffer, + PrivateBase + }; + + enum class AccessKindType { None, ReadOnly, WriteOnly, ReadWrite }; + + private: + unsigned Index; + KindType Kind; + AccessKindType AccessKind; + unsigned Offset; + unsigned SizeInBytes; + unsigned BTI; + + private: + void translateArgDesc(genx::KernelMetadata &KM); + + public: + KernelArgInfo(const Argument &Arg, genx::KernelMetadata &KM, + const DataLayout &DL); + + unsigned getIndex() const { return Index; } + KindType getKind() const { return Kind; } + AccessKindType getAccessKind() const { return AccessKind; } + unsigned getOffset() const { return Offset; } + unsigned getSizeInBytes() const { return SizeInBytes; } + unsigned getBTI() const { return BTI; } + + bool isImage() const { + switch (Kind) { + case KindType::Image1D: + case KindType::Image2D: + case KindType::Image3D: + return true; + default: + return false; + } + } + }; + + struct TableInfo { + void *Buffer = nullptr; + unsigned Size = 0; + unsigned Entries = 0; + }; + + // Additional kernel info that are not provided by finalizer + // but still required for runtime. + struct KernelInfo { + private: + std::string Name; + + bool UsesGroupId = false; + + + // Jitter info contains similar field. + // Whom should we believe? + bool UsesBarriers = false; + + bool UsesReadWriteImages = false; + + unsigned SLMSize = 0; + unsigned ThreadPrivateMemSize = 0; + unsigned StatelessPrivateMemSize = 0; + + unsigned GRFSizeInBytes; + + using ArgInfoStorageTy = std::vector; + using PrintStringStorageTy = std::vector; + ArgInfoStorageTy ArgInfos; + PrintStringStorageTy PrintStrings; + + TableInfo ReloTable; + TableInfo SymbolTable; + + private: + void setInstructionUsageProperties(FunctionGroup &FG, + const GenXSubtarget &ST); + void setMetadataProperties(genx::KernelMetadata &KM, + const GenXSubtarget &ST); + void setArgumentProperties(const Function &Kernel, + genx::KernelMetadata &KM); + void setPrintStrings(const Module &KernelModule); + + public: + using arg_iterator = ArgInfoStorageTy::iterator; + using arg_const_iterator = ArgInfoStorageTy::const_iterator; + using arg_size_type = ArgInfoStorageTy::size_type; + + public: + // Creates kernel info for given function group. + KernelInfo(FunctionGroup &FG, const GenXSubtarget &ST); + + const std::string &getName() const { return Name; } + + // These are considered to always be true (at least in igcmc). + // Preserve this here. + bool usesLocalIdX() const { return true; } + bool usesLocalIdY() const { return true; } + bool usesLocalIdZ() const { return true; } + + // Deduced from actual function instructions. + bool usesGroupId() const { return UsesGroupId; } + + // SIMD size is always set by igcmc to one. Preserve this here. + unsigned getSIMDSize() const { return 1; } + unsigned getSLMSize() const { return SLMSize; } + + // Deduced from actual function instructions. + unsigned getTPMSize() const { return ThreadPrivateMemSize; } + unsigned getStatelessPrivMemSize() const { return StatelessPrivateMemSize; } + + unsigned getGRFSizeInBytes() const { return GRFSizeInBytes; } + + + bool usesBarriers() const { return UsesBarriers; } + bool usesReadWriteImages() const { return UsesReadWriteImages; } + + // Arguments accessors. 
+ arg_iterator arg_begin() { return ArgInfos.begin(); } + arg_iterator arg_end() { return ArgInfos.end(); } + arg_const_iterator arg_begin() const { return ArgInfos.begin(); } + arg_const_iterator arg_end() const { return ArgInfos.end(); } + iterator_range args() { return {arg_begin(), arg_end()}; } + iterator_range args() const { + return {arg_begin(), arg_end()}; + } + arg_size_type arg_size() const { return ArgInfos.size(); } + bool arg_empty() const { return ArgInfos.empty(); } + const PrintStringStorageTy &getPrintStrings() const { return PrintStrings; } + TableInfo &getRelocationTable() { return ReloTable; } + const TableInfo &getRelocationTable() const { return ReloTable; } + TableInfo &getSymbolTable() { return SymbolTable; } + const TableInfo &getSymbolTable() const { return SymbolTable; } + }; + + + class CompiledKernel { + KernelInfo CompilerInfo; + FINALIZER_INFO JitterInfo; + std::string GenBinary; + + public: + CompiledKernel(KernelInfo &&KI, const FINALIZER_INFO &JI, + ArrayRef GenBin); + + const KernelInfo &getKernelInfo() const { return CompilerInfo; } + const FINALIZER_INFO &getJitterInfo() const { return JitterInfo; } + const std::string &getGenBinary() const { return GenBinary; } + }; + +public: + using KernelStorageTy = std::vector; + + using kernel_iterator = KernelStorageTy::iterator; + using kernel_const_iterator = KernelStorageTy::const_iterator; + using kernel_size_type = KernelStorageTy::size_type; + +private: + KernelStorageTy Kernels; + +public: + static char ID; + + GenXOCLRuntimeInfo() : ImmutablePass(ID) { + initializeGenXOCLRuntimeInfoPass(*PassRegistry::getPassRegistry()); + } + + // Save kernel info and jit info for given function in this pass. + void saveCompiledKernel(CompiledKernel &&KD) { + Kernels.push_back(std::move(KD)); + } + + // Move compiled kernels out of this pass. + KernelStorageTy stealCompiledKernels() { return std::move(Kernels); } + + // Kernel descriptor accessors. 
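A rough usage sketch, not part of the patch: a producer (the CISA builder) records one CompiledKernel per function group, and a consumer later drains the pass exactly once. The free functions are hypothetical, and the ArrayRef element type is assumed to be char since the stored binary is a std::string.

// Producer side: called once per compiled kernel.
void recordKernel(llvm::GenXOCLRuntimeInfo &RI,
                  llvm::GenXOCLRuntimeInfo::KernelInfo &&KI,
                  const FINALIZER_INFO &JitInfo,
                  llvm::ArrayRef<char> GenBinary) {
  RI.saveCompiledKernel(
      llvm::GenXOCLRuntimeInfo::CompiledKernel(std::move(KI), JitInfo, GenBinary));
}

// Consumer side: takes ownership of everything accumulated so far; the pass
// is left empty afterwards.
void emitProgram(llvm::GenXOCLRuntimeInfo &RI) {
  for (const auto &K : RI.stealCompiledKernels()) {
    (void)K.getKernelInfo();  // per-kernel metadata for the runtime
    (void)K.getJitterInfo();  // finalizer statistics
    (void)K.getGenBinary();   // raw Gen ISA bytes
  }
}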
+ kernel_iterator kernel_begin() { return Kernels.begin(); } + kernel_iterator kernel_end() { return Kernels.end(); } + kernel_const_iterator kernel_begin() const { return Kernels.begin(); } + kernel_const_iterator kernel_end() const { return Kernels.end(); } + iterator_range kernels() { + return {kernel_begin(), kernel_end()}; + } + iterator_range kernels() const { + return {kernel_begin(), kernel_end()}; + } + kernel_size_type kernel_size() const { return Kernels.size(); } + bool kernel_empty() const { return Kernels.empty(); } +}; + +ModulePass *createGenXOCLInfoExtractorPass( + std::vector &Dest); +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp new file mode 100644 index 000000000000..9cb4239c2472 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp @@ -0,0 +1,2640 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPatternMatch +/// ---------------- +/// +/// This pass performs a small number of GenX-specific peephole optimizations. +/// +/// It is named *pattern match* with the idea that it is analogous to the +/// pattern matching pass in IGC. However IGC's pattern matching is more +/// extensive, and I believe some of its functionality is covered by GenXBaling +/// in the GenX backend. +/// +/// * Turns fp and integer mul+add into mad, if it decides it is profitable. +/// +/// For an integer mul+add, the pass looks at the inputs after accounting for +/// extends that will get baled into the operation in the GenX backend, or +/// folded into the instruction in the finalizer, and it uses mad only if both +/// inputs are short or byte. Our experience on HSW was that using int mad +/// where the inputs are actually 32 bit ints is counterproductive because of +/// the way that the finalizer has to implement it using the hardware's 32x16 +/// multiply. +/// +/// However, this criterion could probably be looser on any arch that has a +/// 32x32 multiply (BDW+, but excluding some later LP variants). This is +/// something to investigate. +/// +/// To implement this, the pass would need to use GenXSubtarget, and there +/// would need to be a has32x32Multiply flag in GenXSubtarget. +/// +/// * Turns cmp+sel into min/max if possible. 
+/// +/// * Flips a boolean not if profitable. +/// +/// * Cleanup predicate region reads if possible. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "GENX_PATTERN_MATCH" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVectorDecomposer.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/GenXIntrinsics/GenXIntrinsicInst.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils/Local.h" + +#include +#include + +using namespace llvm; +using namespace llvm::PatternMatch; +using namespace genx; + +STATISTIC(NumOfMadMatched, "Number of mad instructions matched"); +STATISTIC(NumOfMinMaxMatched, "Number of min/max instructions matched"); + +static cl::opt EnableMadMatcher("enable-mad", cl::init(true), cl::Hidden, + cl::desc("Enable mad matching.")); + +static cl::opt EnableMinMaxMatcher("enable-minmax", cl::init(true), + cl::Hidden, + cl::desc("Enable min/max matching.")); + +namespace { + +class GenXPatternMatch : public FunctionPass, + public InstVisitor { + DominatorTree *DT = nullptr; + LoopInfo *LI = nullptr; + const DataLayout *DL = nullptr; + const TargetOptions *Options; + // Indicates whether there is any change. + bool Changed = false; + +public: + static char ID; + GenXPatternMatch(const TargetOptions *Options = nullptr) + : FunctionPass(ID), Options(Options) {} + + StringRef getPassName() const override { return "GenX pattern match"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + + void visitBinaryOperator(BinaryOperator &I); + + void visitCallInst(CallInst &I); + + void visitSelectInst(SelectInst &I); + + void visitFDiv(BinaryOperator &I); + + void visitICmpInst(ICmpInst &I); + + bool runOnFunction(Function &F) override; + + bool isFpMadEnabled() const { + return EnableMadMatcher && + (!Options || Options->AllowFPOpFusion != FPOpFusion::Strict); + } + +private: + // flipBoolNot : flip a (vector) bool not instruction if beneficial + bool flipBoolNot(Instruction *Inst); + // foldBoolAnd : fold a (vector) bool and into sel/wrregion if beneficial + bool foldBoolAnd(Instruction *Inst); + bool simplifyPredRegion(CallInst *Inst); + bool simplifyWrRegion(CallInst *Inst); + bool simplifyRdRegion(CallInst* Inst); + bool simplifyTruncSat(CallInst *Inst); + bool simplifySelect(Function *F); + bool simplifyVolatileGlobals(Function *F); + bool decomposeSelect(Function *F); + // Preprocessing to help generate integer MAD. 
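A condensed sketch, not part of the patch, of the profitability rule the pass header describes for the integer case: a mul feeding an add is a mad candidate only if, after looking through sign/zero extends, both multiply operands are narrower than 32 bits. The function name is hypothetical; the matchers are from llvm::PatternMatch.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

static bool looksLikeNarrowIntegerMad(llvm::Instruction *I) {
  using namespace llvm;
  using namespace llvm::PatternMatch;
  Value *A = nullptr, *B = nullptr, *C = nullptr;
  // Match (A * B) + C with the mul as either operand of the add.
  if (!match(I, m_Add(m_Mul(m_Value(A), m_Value(B)), m_Value(C))) &&
      !match(I, m_Add(m_Value(C), m_Mul(m_Value(A), m_Value(B)))))
    return false;
  // Look through extends that would be baled or folded anyway.
  auto StripExt = [](Value *V) {
    Value *Src = nullptr;
    if (match(V, m_SExt(m_Value(Src))) || match(V, m_ZExt(m_Value(Src))))
      return Src;
    return V;
  };
  A = StripExt(A);
  B = StripExt(B);
  // Profitable only when both multiply inputs are byte or word sized.
  return A->getType()->getScalarSizeInBits() < 32 &&
         B->getType()->getScalarSizeInBits() < 32;
}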
+ bool distributeIntegerMul(Function *F); + bool propagateFoldableRegion(Function *F); + bool reassociateIntegerMad(Function *F); + bool vectorizeConstants(Function *F); + bool placeConstants(Function *F); + bool simplifyCmp(CmpInst *Cmp); + CmpInst *reduceCmpWidth(CmpInst *Cmp); + bool simplifyNullDst(CallInst *Inst); + // Transform logic operation with a mask from to + bool extendMask(BinaryOperator *BO); +}; + +} // namespace + +char GenXPatternMatch::ID = 0; + +namespace llvm { +void initializeGenXPatternMatchPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXPatternMatch, "GenXPatternMatch", "GenXPatternMatch", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXPatternMatch, "GenXPatternMatch", "GenXPatternMatch", + false, false) + +FunctionPass *llvm::createGenXPatternMatchPass(const TargetOptions *Options) { + initializeGenXPatternMatchPass(*PassRegistry::getPassRegistry()); + return new GenXPatternMatch(Options); +} + +bool GenXPatternMatch::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); + DL = &F.getParent()->getDataLayout(); + + // Before we get the simd-control-flow representation right, + // we avoid dealing with predicate constants + loadPhiConstants(&F, DT, true); + Changed |= distributeIntegerMul(&F); + Changed |= propagateFoldableRegion(&F); + Changed |= reassociateIntegerMad(&F); + Changed |= placeConstants(&F); + Changed |= vectorizeConstants(&F); + + visit(F); + + Changed |= simplifyVolatileGlobals(&F); + Changed |= simplifySelect(&F); + // Break big predicate variables and run after min/max pattern match. + Changed |= decomposeSelect(&F); + + return Changed; +} + +namespace { + +// Helper class to share common code. +class MadMatcher { +public: + explicit MadMatcher(Instruction *I) + : AInst(I), MInst(nullptr), ID(GenXIntrinsic::not_any_intrinsic), NegIndex(-1) { + assert(I && "null instruction"); + Srcs[0] = Srcs[1] = Srcs[2] = nullptr; + } + + // Match mads with floating point operands. + bool matchFpMad(); + + // Match integer mads that starts with binary operators. + bool matchIntegerMad(); + + // Match integer mads that starts with genx_*add intrinsic calls. + bool matchIntegerMad(unsigned IID); + +private: + // Return true if changes are made. + bool emit(); + + // Check whether it is profitable to emit a mad. + // + // Each mad out of add implies a duplicated mul and jitter usually can not + // remove it in the end. + // + // It is a bit more subtle for the integer case. Since 32 bit mul is not well + // supported in HW, it may lead to worse code if a 32 bit integer mad cannot + // be emitted as mac in the end and mul + mach could be emitted. + bool isProfitable() const; + + // Checks whether a fp mad is being matched or not. + bool isFpMad() const { return ID == Intrinsic::fma; } + + void setMInst(Instruction *I) { MInst = I; } + + // Checks whether 'MInst' is an integer shift, which could be turned back to + // an integer muliplication. + bool isLShift() const { return MInst->getOpcode() == Instruction::Shl; } + + std::tuple getNarrowI16Vector(IRBuilder<> &, Instruction *, + Value *, unsigned) const; + +private: + // The instruction starts the mad matching: + // * fadd/fsub + // * add/sub + // * genx_*add + Instruction *AInst; + + // The instruction being sinked into: + // * fmul + // * mul/shl + // * genx_*mul + Instruction *MInst; + + // The mad intrinsic ID. 
+ unsigned ID; + + // Source operands for the mad intrinsic call, representing mad as + // srcs[0] * srcs[1] + srcs[2]. + Value *Srcs[3]; + + // Indicates whether Srcs[NegIndex] needs to be negated. Value -1 means no + // negation is needed. + int NegIndex; +}; + + + +// Class to identify cases where a comparison and select are equivalent to a +// min or max operation. These are replaced by a min/max intrinsic which allows +// the jitter to produce better code for these cases. +class MinMaxMatcher { +public: + explicit MinMaxMatcher(Instruction *I) + : SelInst(I), CmpInst(nullptr), ID(GenXIntrinsic::not_any_intrinsic) { + assert(I && "null instruction"); + Srcs[0] = Srcs[1] = nullptr; + Annotation = 0; + } + + // Match select instruction that are equivalent to min/max + bool matchMinMax(); + + bool valuesMatch(llvm::Value *Op1, llvm::Value *Op2); + + static bool isEnabled() { return EnableMinMaxMatcher; } + +private: + // Return true if changes are made. + bool emit(); + + void setSelInst(Instruction *I) { SelInst = I; } + +private: + // The select instruction + Instruction *SelInst; + + // The compare instruction + llvm::CmpInst *CmpInst; + + // The min/max intrinsic ID. + unsigned ID; + + // Source operands for the min/max intrinsic call + Value *Srcs[2]; + + // Effective operands for the cmp ignoring some casts + Value *CmpSrcs[2]; + + // Annotation for the min/max call + const char *Annotation; +}; + +} // namespace + +void GenXPatternMatch::visitBinaryOperator(BinaryOperator &I) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? P->getSubtarget() : nullptr; + if (isPredNot(&I)) + Changed |= flipBoolNot(&I); + else + switch (I.getOpcode()) { + default: + break; + case Instruction::FAdd: + case Instruction::FSub: + Changed |= isFpMadEnabled() && MadMatcher(&I).matchFpMad(); + break; + case Instruction::Add: + case Instruction::Sub: + if (EnableMadMatcher && MadMatcher(&I).matchIntegerMad()) + Changed = true; + break; + case Instruction::And: + if (I.getType()->getScalarType()->isIntegerTy(1)) { + if (foldBoolAnd(&I)) + Changed = true; + } else if (extendMask(&I)) + Changed = true; + break; + case Instruction::Or: + case Instruction::Xor: + if (extendMask(&I)) + Changed = true; + break; + } +} + +void GenXPatternMatch::visitCallInst(CallInst &I) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? 
P->getSubtarget() : nullptr; + switch (unsigned ID = GenXIntrinsic::getGenXIntrinsicID(&I)) { + default: + break; + case GenXIntrinsic::genx_ssadd_sat: + case GenXIntrinsic::genx_suadd_sat: + case GenXIntrinsic::genx_usadd_sat: + case GenXIntrinsic::genx_uuadd_sat: + if (EnableMadMatcher && MadMatcher(&I).matchIntegerMad(ID)) + Changed = true; + break; + case GenXIntrinsic::genx_rdpredregion: + Changed |= simplifyPredRegion(&I); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + Changed |= simplifyWrRegion(&I); + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + Changed |= simplifyRdRegion(&I); + break; + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_sutrunc_sat: + case GenXIntrinsic::genx_ustrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + Changed |= simplifyTruncSat(&I); + break; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_xor: + Changed |= simplifyNullDst(&I); + break; + } +} + +void GenXPatternMatch::visitICmpInst(ICmpInst &I) { + // Ignore dead comparison. + if (I.use_empty()) + return; + + Value *V0 = nullptr; + Constant *C1 = nullptr; + Constant *C2 = nullptr; + ICmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + + // Transform icmp (V0 & 65535), C2 ==> icmp (trunc V0 to i16), C2. + // TODO: Only consider unsigned comparisons so do not inspect the sign bit. + if (I.isUnsigned() && + match(&I, m_ICmp(Pred, m_OneUse(m_And(m_Value(V0), m_Constant(C1))), + m_Constant(C2))) && + C1->getType()->isVectorTy()) { + Type *Ty = V0->getType(); + if (auto Elt = dyn_cast_or_null(C1->getSplatValue())) { + auto Known = computeKnownBits(C2, *DL); + unsigned NBits = Known.Zero.countLeadingOnes(); + + IRBuilder<> Builder(&I); + uint64_t Int16Mask = std::numeric_limits::max(); + uint64_t Int8Mask = std::numeric_limits::max(); + + // Check if it is safe to truncate to lower type without loss of bits. + Type *DstTy = nullptr; + uint64_t Val = Elt->getZExtValue(); + unsigned NElts = Ty->getVectorNumElements(); + unsigned BitWidth = Elt->getType()->getPrimitiveSizeInBits(); + if (Val == Int16Mask && NBits + 16 >= BitWidth) + DstTy = VectorType::get(Builder.getInt16Ty(), NElts); + else if (Val == Int8Mask && NBits + 8 >= BitWidth) + DstTy = VectorType::get(Builder.getInt8Ty(), NElts); + + // Lower trunc to bitcast followed by a region read + // as such bitcast is not support after IR lowering. + if (DstTy) { + Type *InEltTy = Ty->getVectorElementType(); + Type *OutEltTy = DstTy->getVectorElementType(); + assert(OutEltTy->getPrimitiveSizeInBits()); + unsigned Stride = InEltTy->getPrimitiveSizeInBits() / + OutEltTy->getPrimitiveSizeInBits(); + // Create the new bitcast. + Instruction *BC = CastInst::Create( + Instruction::BitCast, V0, VectorType::get(OutEltTy, Stride * NElts), + ".bc", &I /*InsertBefore*/); + BC->setDebugLoc(I.getDebugLoc()); + + // Create the new rdregion. 
+ Region R(BC); + R.NumElements = NElts; + R.Stride = Stride; + R.Width = NElts; + R.VStride = R.Stride * R.Width; + Value *LHS = R.createRdRegion(BC, "", &I /*InsertBefore*/, + I.getDebugLoc(), false /*AllowScalar*/); + Value *RHS = Builder.CreateTrunc(C2, DstTy); + assert(isa(RHS)); + Value *NewICmp = Builder.CreateICmp(Pred, LHS, RHS); + if (auto Inst = dyn_cast(NewICmp)) + Inst->setDebugLoc(I.getDebugLoc()); + I.replaceAllUsesWith(NewICmp); + Changed = true; + } + } + } + + // Explore (icmp.ne V0, 0) where V0 is promoted from i1. + if (match(&I, m_ICmp(Pred, m_Value(V0), m_Zero())) && + Pred == CmpInst::ICMP_NE) { + // V0 is calculated from AND, OR, NOT, and (select (cmp ...), 0, 1) + SmallVector WorkList; + SmallVector PreOrder; + bool Profitable = true; + WorkList.push_back(V0); + while (!WorkList.empty()) { + Value *V = WorkList.pop_back_val(); + Value *LHS = nullptr, *RHS = nullptr; + if (match(V, m_OneUse(m_Or(m_Value(LHS), m_Value(RHS))))) { + WorkList.push_back(LHS); + WorkList.push_back(RHS); + PreOrder.push_back(V); + continue; + } + if (match(V, m_OneUse(m_And(m_Value(LHS), m_Value(RHS))))) { + WorkList.push_back(LHS); + WorkList.push_back(RHS); + PreOrder.push_back(V); + continue; + } + if (match(V, m_OneUse(m_Not(m_Value(LHS))))) { + WorkList.push_back(LHS); + PreOrder.push_back(V); + continue; + } + Value *Cond = nullptr; + if (match(V, m_OneUse(m_Select(m_Value(Cond), m_One(), m_Zero())))) { + PreOrder.push_back(Cond); + continue; + } + Profitable = false; + break; + } + if (Profitable) { + IRBuilder<> Builder(&I); + // For simplicity, a stack is used to reconstruct tree. With a next + // pointer, that stack is not necessary. + SmallVector OpStack; + while (!PreOrder.empty()) { + Value *V = PreOrder.pop_back_val(); + if (V->getType()->getScalarType()->isIntegerTy(1)) { + OpStack.push_back(V); + continue; + } + Value *LHS, *RHS; + if (match(V, m_Or(m_Value(LHS), m_Value(RHS)))) { + assert(OpStack.size() >= 2); + RHS = OpStack.pop_back_val(); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateOr(LHS, RHS)); + continue; + } + if (match(V, m_And(m_Value(LHS), m_Value(RHS)))) { + assert(OpStack.size() >= 2); + RHS = OpStack.pop_back_val(); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateAnd(LHS, RHS)); + continue; + } + if (match(V, m_Not(m_Value(LHS)))) { + assert(OpStack.size() >= 1); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateNot(LHS)); + } + assert(false && "Unhandled logic op!"); + } + assert(OpStack.size() == 1); + I.replaceAllUsesWith(OpStack.pop_back_val()); + Changed = true; + return; + } + } + + // Skip the following optimization specific to scalar comparison. + if (!I.getType()->isIntegerTy(1)) + return; + + // Transform the evaluation of flag == 0 into (~flag).all(). + // TODO: Transform flag != 0 into flag.any(). + if (match(&I, m_ICmp(Pred, m_OneUse(m_BitCast(m_OneUse(m_Value(V0)))), + m_Zero())) && + Pred == CmpInst::ICMP_EQ && isa(V0) && + V0->getType()->isVectorTy() && + V0->getType()->getScalarType()->isIntegerTy(1)) { + VectorType *VTy = cast(V0->getType()); + unsigned NumElts = VTy->getNumElements(); + if (NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) { + IRBuilder<> Builder(&I); + auto Cmp = cast(V0); + // Inverse the evaluation of flag. + Cmp->setPredicate(Cmp->getInversePredicate()); + if (auto NewCmp = reduceCmpWidth(Cmp)) { + // Once the cmp could be reduced into narrower one (with the assumption + // that the reduced part is always TRUE), reduce it into narrow one. 
+ Cmp = NewCmp; + VTy = cast(Cmp->getType()); + } + simplifyCmp(Cmp); + // Call 'all'. + auto M = I.getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_all, VTy); + auto NewVal = Builder.CreateCall(Fn, Cmp); + I.replaceAllUsesWith(NewVal); + Changed = true; + return; + } + } +} + +// Simplify the sequence of (cmp.eq (and (wrregion zero v), 1), 0) to +// (cmp.eq (and v, 1), 0) with a narrow vector length with the assumption that +// the reduced part will be always TRUE. +CmpInst *GenXPatternMatch::reduceCmpWidth(CmpInst *Cmp) { + ICmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + Value *V0 = nullptr; + if (!Cmp->hasOneUse() || !Cmp->getType()->isVectorTy() || + !match(Cmp, m_ICmp(Pred, m_And(m_Value(V0), m_One()), m_Zero())) || + Pred != CmpInst::ICMP_EQ || !GenXIntrinsic::isWrRegion(V0)) + return nullptr; + + GenXIntrinsicInst *WII = cast(V0); + if (!match(WII->getOperand(0), m_Zero())) + return nullptr; + + V0 = WII->getOperand(1); + VectorType *VTy = cast(V0->getType()); + unsigned NumElts = VTy->getNumElements(); + + Region R(WII, BaleInfo()); + if (R.Indirect || R.Offset || R.VStride || R.Stride != 1 || + R.Width != NumElts) + return nullptr; + if (R.Width != 2 && R.Width != 4 && R.Width != 8 && R.Width != 16) + return nullptr; + + // As the rest parts of the original vector are all zeros, the sequence could + // be reduced into a narrower one (R.Width) and skip the wrregion. + IRBuilder<> Builder(Cmp); + + auto One = ConstantInt::get(VTy, 1); + auto Zero = Constant::getNullValue(VTy); + + auto V1 = Builder.CreateAnd(V0, One); + auto V2 = Builder.CreateICmp(Pred, V1, Zero); + + return cast(V2); +} + +// Simplify the sequence of (cmp (and (select (cmp ...) 1, 0), 1), 0) +bool GenXPatternMatch::simplifyCmp(CmpInst *Cmp) { + ICmpInst::Predicate P0 = ICmpInst::BAD_ICMP_PREDICATE; + ICmpInst::Predicate P1 = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr; + Value *RHS = nullptr; + if (!match(Cmp, m_ICmp(P0, + m_And(m_Select(m_ICmp(P1, m_Value(LHS), m_Value(RHS)), + m_One(), m_Zero()), + m_One()), + m_Zero()))) + return false; + if (P0 != ICmpInst::ICMP_EQ && P0 != ICmpInst::ICMP_NE) + return false; + if (P0 == ICmpInst::ICMP_EQ) + P1 = ICmpInst::getInversePredicate(P1); + Cmp->setPredicate(P1); + Cmp->setOperand(0, LHS); + Cmp->setOperand(1, RHS); + return true; +} + +/*********************************************************************** + * notHasRealUse : detect whether an instruction has a use that counts as + * a "real" use of a bool not, that is one where it would need to be + * calculated rather than just baled in + */ +static bool notHasRealUse(Instruction *Inst) { + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (isPredNot(user)) + continue; + if (isa(user)) + continue; + if (user->use_empty()) + continue; // ignore dead instruction + switch (GenXIntrinsic::getGenXIntrinsicID(user)) { + case GenXIntrinsic::genx_any: + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + continue; + default: + return true; + } + } + return false; +} + +/*********************************************************************** + * GenXPatternMatch::flipBoolNot : attempt to flip (vector) bool not + * + * A vector bool not is bad if its value actually needs to be calculated, + * as opposed to just baling it into a predicate field. 
In gen code, + * calculating it involves using a sel to get it into a GRF, then doing + * an xor that sets flags. Here we call any use that requires it to be + * calculated a "real" use. + * + * This code detects the case that: + * 1. the not has at least one "real" use + * 2. the input to the not is the result of a cmp and does not have any + * "real" use. + * If these conditions hold, then we flip the not by inverting the + * cmp and replacing uses of the not with the new inverted cmp. If the + * original cmp has any uses other than the original not, then we create + * a new not and change uses to that. + * + * In this way we save an actual calculation of the original not. + * + * We only do this for a v16i1 or smaller. + */ +bool GenXPatternMatch::flipBoolNot(Instruction *Inst) { + if (Inst->getType()->getPrimitiveSizeInBits() > 16) + return false; // too big + auto Input = dyn_cast(Inst->getOperand(0)); + if (!Input) + return false; // input not cmp + if (!notHasRealUse(Inst)) + return false; // result of not has no "real" use + if (notHasRealUse(Input)) + return false; // input has a "real" use, so we don't want to flip + // We want to flip the not by inverting the comparison that generates its + // input. + auto NewCmp = CmpInst::Create( + Input->getOpcode(), Input->getInversePredicate(), Input->getOperand(0), + Input->getOperand(1), Input->getName() + ".inverted", Input); + NewCmp->setDebugLoc(Input->getDebugLoc()); + Inst->replaceAllUsesWith(NewCmp); + if (!Input->use_empty()) { + auto NewNot = BinaryOperator::Create( + Instruction::Xor, NewCmp, Constant::getAllOnesValue(NewCmp->getType()), + "", Input); + NewNot->setDebugLoc(Input->getDebugLoc()); + NewNot->takeName(Inst); + Input->replaceAllUsesWith(NewNot); + } + return true; +} + +/*********************************************************************** + * foldBoolAnd : fold a (vector) bool and into sel/wrregion if beneficial + * + * A bool and takes a sequence of 3 gen instructions. Here we detect if + * a bool and has a single use in a select or wrregion, and if so we fold + * it in to have two selects or rdregion, select, wrregion respectively. + * + * We only do this for a v16i1 or smaller. + */ +bool GenXPatternMatch::foldBoolAnd(Instruction *Inst) { + if (Inst->getType()->getPrimitiveSizeInBits() > 16) + return false; // too big + if (!isa(Inst->getType())) + return false; // too small + if (!Inst->hasOneUse()) + return false; // more than one use + auto user = cast(Inst->use_begin()->getUser()); + if (auto Sel = dyn_cast(user)) { + // Fold and into sel. + auto NewSel1 = SelectInst::Create(Inst->getOperand(0), Sel->getOperand(1), + Sel->getOperand(2), + Sel->getName() + ".foldand", Sel); + NewSel1->setDebugLoc(Sel->getDebugLoc()); + auto NewSel2 = SelectInst::Create(Inst->getOperand(1), NewSel1, + Sel->getOperand(2), "", Sel); + NewSel2->takeName(Sel); + NewSel2->setDebugLoc(Sel->getDebugLoc()); + Sel->replaceAllUsesWith(NewSel2); + return true; + } + if (!GenXIntrinsic::isWrRegion(user)) + return false; + // Fold and into wrregion, giving rdregion, select and wrregion, as long + // as the original wrregion is not indirect. 
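A plain C++ sanity check, illustrative only and not part of the patch: the two-select rewrite performed above relies on the identity sel(a && b, x, y) == sel(b, sel(a, x, y), y), which holds for every combination of booleans a and b.

#include <cassert>

static void checkFoldBoolAndIdentity(int X, int Y) {
  for (bool A : {false, true})
    for (bool B : {false, true}) {
      int Folded = (A && B) ? X : Y; // original: select on (and a, b)
      int Sel1 = A ? X : Y;          // NewSel1
      int Sel2 = B ? Sel1 : Y;       // NewSel2 replaces the original select
      assert(Folded == Sel2);
    }
}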
+ Region R(user, BaleInfo()); + if (R.Indirect) + return false; + auto NewRdRegion = + R.createRdRegion(user->getOperand(0), user->getName() + ".foldand1", user, + user->getDebugLoc(), false); + auto NewSel = + SelectInst::Create(Inst->getOperand(0), user->getOperand(1), NewRdRegion, + user->getName() + ".foldand2", user); + NewSel->setDebugLoc(user->getDebugLoc()); + R.Mask = Inst->getOperand(1); + auto NewWrRegion = cast(R.createWrRegion( + user->getOperand(0), NewSel, "", user, user->getDebugLoc())); + NewWrRegion->takeName(user); + user->replaceAllUsesWith(NewWrRegion); + return true; +} + +void GenXPatternMatch::visitSelectInst(SelectInst &I) { + Changed |= MinMaxMatcher::isEnabled() && MinMaxMatcher(&I).matchMinMax(); +} + +// Trace the def-use chain and return the first non up-cast related value. +static Value *getEffectiveValueUp(Value *V) { + if (isa(V) || isa(V) || isa(V)) + return getEffectiveValueUp(cast(V)->getOperand(0)); + + return V; +} + +// Determine whether it is profitable to match a mad. This function assumes +// that it is valid to match. +bool MadMatcher::isProfitable() const { + // Do not match unused instructions. + if (AInst->use_empty()) + return false; + + // For the following case, + // %m = mul %a, %b + // %a1 = add %m, %c1 + // %a2 = add %m, %c2 + // + // If we match them into two mads as + // + // %m1 = mad(%a, %b, %c1) + // %m2 = mad(%a, %b, %c2) + // + // and it fails to emit two mac/mads then there are redundant instructions in + // the end. Conservatively, only match when there is a single use for MInst. + // + // Update: There are enough cases where this transformation helps spilling + // (particularly for long sequences) that mean it is of more value to enable + // multiple use cases. May need to revisit. if (!MInst->hasOneUse()) + // return false; + + // Do not match x * y +/- 0.0f + // FIXME: specify fp mode. ICL certainly is not strict in general. + if (Constant *C = dyn_cast(Srcs[2])) + if (C->isZeroValue()) + return false; + + // Ignores upward or bit casts, which usually will be performed by copy + // propagation within jitter. + Value *Vals[] = {getEffectiveValueUp(Srcs[0]), getEffectiveValueUp(Srcs[1]), + getEffectiveValueUp(Srcs[2])}; + + auto isIndirectRdRegion = [](Value *V) -> bool { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + Region R(cast(V), BaleInfo()); + return R.Indirect; + }; + + auto isIndirectWrRegion = [](User *U) -> bool { + if (!GenXIntrinsic::isWrRegion(U)) + return false; + Region R(cast(U), BaleInfo()); + return R.Indirect; + }; + + // If the result of this mad used solely in an indirect + // region write, count it as an indirect access. + bool IsIndirectDst = false; + if (AInst->hasOneUse()) { + User *U = AInst->use_begin()->getUser(); + IsIndirectDst = isIndirectWrRegion(U); + } + + if (isFpMad()) { + // Agressive on floating point types since there are fewer constraints, + // considering up to one indirect region access to be worthwhile. + // For non-FP mads, any indirect region accesses make it not worth + // bothering. + unsigned IndirectCount = 0; + if (isIndirectRdRegion(Vals[0])) + IndirectCount++; + if (isIndirectRdRegion(Vals[1])) + IndirectCount++; + if (isIndirectRdRegion(Vals[2])) + IndirectCount++; + if (IsIndirectDst) + IndirectCount++; + return IndirectCount <= 1; + } + + if (IsIndirectDst || isIndirectRdRegion(Vals[2]) || + (isIndirectRdRegion(Vals[0]) && isIndirectRdRegion(Vals[1]))) + // For integer mad, we only support indirect access on one of + // multiplicative operands. 
+ return false; + + // This is an integer mad. + // Do not match constant add. I was getting bad results from allowing this, + // although it may have been largely from scalar address computations. + if (isa(Srcs[2])) + return false; + + // Do not match unless both of multiplicants are of type *B/*W. + bool IsProfitable = true; + + auto Checker = [](Value *V) -> bool { + // TODO, handle constants more accurately. + if (isa(V)) + return true; + const unsigned DWordSizeInBits = 32; + return (V->getType()->getScalarSizeInBits() < DWordSizeInBits); + }; + + auto HasKnownShAmtLT16 = [](Value *V) -> bool { + ConstantInt *C = dyn_cast(V); + if (!C) { + if (!isa(V)) + return false; + C = dyn_cast(cast(V)->getSplatValue()); + if (!C) + return false; + } + return C->getValue().ult(16); + }; + + IsProfitable = Checker(Vals[0]); + if (!IsProfitable) + return false; + + IsProfitable = isLShift() ? HasKnownShAmtLT16(Vals[1]) : Checker(Vals[1]); + if (!IsProfitable) + return false; + + // Safety check on indirect access if any. + GenXIntrinsicInst *RII = nullptr; + if (isIndirectRdRegion(Vals[0])) + RII = cast(Vals[0]); + else if (isIndirectRdRegion(Vals[1])) + RII = cast(Vals[1]); + + // Always profitable if there's no indirect access. + if (!RII) + return true; + // Assume not profitable if the indirect access is defined in another BB to + // avoid expensive alias analysis. + if (RII->getParent() != AInst->getParent()) + return false; + + return IsProfitable; +} + +static Value *getBroadcastFromScalar(Value *V) { + VectorType *VTy = dyn_cast(V->getType()); + // Skip if it's not vector type. + if (!VTy) + return nullptr; + // Skip if it's not from rdregion. + if (!GenXIntrinsic::isRdRegion(V)) + return nullptr; + GenXIntrinsicInst *RII = cast(V); + Region R(RII, BaleInfo()); + if (!R.isScalar() || R.Width != 1 || R.Offset != 0) + return nullptr; + Value *Src = RII->getArgOperand(0); + auto *BC = dyn_cast(Src); + if (!BC) + return nullptr; + VTy = dyn_cast(BC->getType()); + if (!VTy || VTy->getNumElements() != 1 || + VTy->getScalarType() != BC->getOperand(0)->getType()) + return nullptr; + return BC->getOperand(0); +} + +class FAddOperator + : public ConcreteOperator {}; + +class FSubOperator + : public ConcreteOperator {}; + +class FMulOperator + : public ConcreteOperator {}; + +class ExtOperator : public Operator { +public: + static bool isExtOpcode(unsigned Opc) { + return Opc == Instruction::SExt || Opc == Instruction::ZExt; + } + static inline bool classof(const Instruction *I) { + return isExtOpcode(I->getOpcode()); + } + static inline bool classof(const ConstantExpr *CE) { + return isExtOpcode(CE->getOpcode()); + } + static inline bool classof(const Value *V) { + return (isa(V) && classof(cast(V))) || + (isa(V) && classof(cast(V))); + } +}; + +class MulLikeOperator : public Operator { +public: + static bool isMulLikeOpcode(unsigned Opc) { + return Opc == Instruction::Mul || Opc == Instruction::Shl; + } + static inline bool classof(const Instruction *I) { + return isMulLikeOpcode(I->getOpcode()); + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +std::tuple +MadMatcher::getNarrowI16Vector(IRBuilder<> &Builder, Instruction *AInst, + Value *V, unsigned NumElts) const { + assert(V->getType()->getScalarType()->isIntegerTy(32) && "I32 is expected!"); + if (auto Ext = dyn_cast(V)) { + V = Ext->getOperand(0); + if (V->getType()->getScalarType()->isIntegerTy(8)) { + Type *DstTy = Builder.getInt16Ty(); + if (auto VTy = dyn_cast(V->getType())) + DstTy = 
VectorType::get(DstTy, VTy->getNumElements()); + // Extend to i16 first. + V = Builder.CreateCast(Instruction::CastOps(Ext->getOpcode()), V, DstTy); + } + if (!V->getType()->isVectorTy()) { + // Broadcast through rdregion. + Type *NewTy = VectorType::get(V->getType(), 1); + V = Builder.CreateBitCast(V, NewTy); + Region R(V); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = NumElts; + V = R.createRdRegion(V, ".splat", AInst, AInst->getDebugLoc()); + } + return std::make_tuple(V, Ext->getOpcode() == Instruction::SExt); + } + if (auto CI = dyn_cast(V)) { + const APInt &Val = CI->getValue(); + if (Val.isIntN(16)) { + V = ConstantVector::getSplat(NumElts, + Builder.getIntN(16, Val.getZExtValue())); + return std::make_tuple(V, Val.isSignedIntN(16)); + } + } + return std::make_tuple(nullptr, false); +} + +// The floating point case is relatively simple. Only need to match with fmul. +bool MadMatcher::matchFpMad() { + assert(AInst->getOpcode() == Instruction::FAdd || + AInst->getOpcode() == Instruction::FSub); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + for (unsigned Idx = 0; Idx != 2; ++Idx) { + Value *Op0 = Ops[Idx]; + Value *Op1 = Ops[1 - Idx]; + if (BinaryOperator *BO = dyn_cast(Op0)) { + // Case +/-(X * Y) +/- Z + if (BO->getOpcode() == Instruction::FMul) { + Srcs[0] = BO->getOperand(0); + Srcs[1] = BO->getOperand(1); + Srcs[2] = Op1; + + setMInst(BO); + if (AInst->getOpcode() == Instruction::FSub) + NegIndex = 2 - Idx; + break; + } + } + if (!MInst) { + if (BinaryOperator *BO = dyn_cast(Op1)) { + // Case Z +/- X * Y + if (BO->getOpcode() == Instruction::FMul) { + Srcs[0] = BO->getOperand(0); + Srcs[1] = BO->getOperand(1); + Srcs[2] = Op0; + + setMInst(BO); + if (AInst->getOpcode() == Instruction::FSub) + NegIndex = 1; + break; + } + } + } + } + + // No genx intrinsic mad for the fp case. + ID = Intrinsic::fma; + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::matchIntegerMad() { + assert(AInst->getOpcode() == Instruction::Add || + AInst->getOpcode() == Instruction::Sub); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + if (auto BI = dyn_cast(Ops[0])) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 2; + } else + setMInst(nullptr); + } + + if (!MInst) { + if (auto BI = dyn_cast(Ops[1])) { + // Case Z +/- X * Y + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 1; + } else + setMInst(nullptr); + } + } + + if (!MInst) { // Check if operand 0 is broadcasted from scalar. + if (auto S = getBroadcastFromScalar(Ops[0])) { + if (auto BI = dyn_cast(S)) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 2; + } else + setMInst(nullptr); + } + } + } + + if (!MInst) { // Check if operand 1 is broadcasted from scalar. 
+ if (auto S = getBroadcastFromScalar(Ops[1])) { + if (auto BI = dyn_cast(S)) { + // Case X * Y +/- Z + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 1; + } else + setMInst(nullptr); + } + } + } + + // Always use ssmad. + ID = GenXIntrinsic::genx_ssmad; + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::matchIntegerMad(unsigned IID) { + assert((GenXIntrinsic::getAnyIntrinsicID(AInst) == IID) && "input out of sync"); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + // TODO: handle cases like: cm_add(cm_mul(u, v), w). + if (BinaryOperator *BI = dyn_cast(Ops[0])) { + if (BI->getOpcode() == Instruction::Mul || + BI->getOpcode() == Instruction::Shl) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(BI); + if (!isProfitable()) + setMInst(nullptr); + } + } + if (!MInst) { + if (BinaryOperator *BI = dyn_cast(Ops[1])) { + // Case Z +/- X * Y + if (BI->getOpcode() == Instruction::Mul || + BI->getOpcode() == Instruction::Shl) { + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(BI); + if (!isProfitable()) + setMInst(nullptr); + } + } + } + + switch (IID) { + default: + llvm_unreachable("unexpected intrinsic ID"); + case GenXIntrinsic::genx_ssadd_sat: + ID = GenXIntrinsic::genx_ssmad_sat; + break; + case GenXIntrinsic::genx_suadd_sat: + ID = GenXIntrinsic::genx_sumad_sat; + break; + case GenXIntrinsic::genx_usadd_sat: + ID = GenXIntrinsic::genx_usmad_sat; + break; + case GenXIntrinsic::genx_uuadd_sat: + ID = GenXIntrinsic::genx_uumad_sat; + break; + } + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::emit() { + if (MInst == nullptr || !isProfitable()) + return false; + + IRBuilder<> Builder(AInst); + + VectorType *VTy = dyn_cast(Srcs[2]->getType()); + if (!isFpMad() && VTy && VTy->getScalarType()->isIntegerTy(32)) { + Value *V = getBroadcastFromScalar(Srcs[2]); + if (!V) + V = Srcs[2]; + auto BO = dyn_cast(V); + if (BO && BO->getOpcode() == Instruction::Mul) { + // If both operands could be reduced to narrow integer types, use 'mul' + // intrinsic. + Value *V0 = nullptr, *V1 = nullptr; + bool S0 = false, S1 = false; + std::tie(V0, S0) = getNarrowI16Vector(Builder, AInst, BO->getOperand(0), + VTy->getNumElements()); + std::tie(V1, S1) = getNarrowI16Vector(Builder, AInst, BO->getOperand(1), + VTy->getNumElements()); + if (V0 && V1) { + GenXIntrinsic::ID IID = + S0 ? (S1 ? GenXIntrinsic::genx_ssmul : GenXIntrinsic::genx_sumul) + : (S1 ? 
GenXIntrinsic::genx_usmul : GenXIntrinsic::genx_uumul); + Module *M = AInst->getParent()->getParent()->getParent(); + Type *Tys[2] = {VTy, V0->getType()}; + Function *Fn = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + Value *Vals[2] = {V0, V1}; + CallInst *CI = Builder.CreateCall(Fn, Vals, "mul"); + Srcs[2] = CI; + } + } + } + + Value *Vals[3] = {Srcs[0], Srcs[1], Srcs[2]}; + + if (isa(AInst)) { + ExtOperator *E0 = dyn_cast(Vals[0]); + ExtOperator *E1 = dyn_cast(Vals[1]); + if (E0 && E1 && + E0->getOperand(0)->getType() == E1->getOperand(0)->getType()) { + if (E0->getOpcode() == Instruction::SExt) { + if (E1->getOpcode() == Instruction::SExt) + ID = GenXIntrinsic::genx_ssmad; + else + ID = GenXIntrinsic::genx_sumad; + } else { + if (E1->getOpcode() == Instruction::SExt) + ID = GenXIntrinsic::genx_usmad; + else + ID = GenXIntrinsic::genx_uumad; + } + Vals[0] = E0->getOperand(0); + Vals[1] = E1->getOperand(0); + } + } + + if (auto VTy = dyn_cast(Vals[2]->getType())) { + // Splat scalar sources if necessary. + for (unsigned i = 0; i != 2; ++i) { + Value *V = Vals[i]; + if (V->getType()->isVectorTy()) + continue; + if (auto C = dyn_cast(V)) { + Vals[i] = ConstantVector::getSplat(VTy->getNumElements(), C); + continue; + } + auto Ext = dyn_cast(V); + if (Ext) + V = Ext->getOperand(0); + Type *NewTy = VectorType::get(V->getType(), 1); + V = Builder.CreateBitCast(V, NewTy); + // Broadcast through rdregin. + Region R(V); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = VTy->getNumElements(); + V = R.createRdRegion(V, ".splat", AInst, AInst->getDebugLoc()); + if (Ext) + V = Builder.CreateCast(Instruction::CastOps(Ext->getOpcode()), V, VTy); + Vals[i] = V; + } + } + + if (isLShift()) { + Type *Ty = Vals[0]->getType(); + Constant *Base = ConstantInt::get(Ty->getScalarType(), 1); + if (Ty->isVectorTy()) + Base = ConstantVector::getSplat(Ty->getVectorNumElements(), Base); + Vals[1] = Builder.CreateShl(Base, Vals[1]); + } + + // Perform source operand negation if necessary. + if (NegIndex >= 0) { + if (AInst->getType()->isFPOrFPVectorTy()) + Vals[NegIndex] = Builder.CreateFNeg(Vals[NegIndex], "fneg"); + else + Vals[NegIndex] = Builder.CreateNeg(Vals[NegIndex], "neg"); + } + + Function *Fn = nullptr; + { + Module *M = AInst->getParent()->getParent()->getParent(); + if (AInst->getType()->isFPOrFPVectorTy()) + Fn = GenXIntrinsic::getAnyDeclaration(M, ID, AInst->getType()); + else { + Type *Tys[2] = {AInst->getType(), Vals[0]->getType()}; + Fn = GenXIntrinsic::getAnyDeclaration(M, ID, Tys); + } + } + CallInst *CI = Builder.CreateCall(Fn, Vals, "mad"); + CI->setDebugLoc(AInst->getDebugLoc()); + AInst->replaceAllUsesWith(CI); + + NumOfMadMatched++; + return true; +} + + + +bool MinMaxMatcher::valuesMatch(llvm::Value *Op1, llvm::Value *Op2) { + // Handle casts for instructions. + bool ZExt = false; + if (CastInst *CI = dyn_cast(Op1)) { + Op1 = CI->getOperand(0); + if (CI->getOpcode() == Instruction::ZExt) + ZExt = true; + } + if (CastInst *CI = dyn_cast(Op2)) { + Op2 = CI->getOperand(0); + if (CI->getOpcode() == Instruction::ZExt && !ZExt) + return false; + } + + // the easy case - the operands match + if (Op1 == Op2) + return true; + + // Handle constant zeros before data vectors. 
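A plain C++ analogue of the cast handling just above, not part of the patch: when the compare is done on zero-extended copies while the select picks the original narrow values, the result is still a max on the narrow type, which is why the matcher may look through the zext when pairing cmp and select operands.

#include <algorithm>
#include <cstdint>

static uint8_t selectAfterWideningCompare(uint8_t A, uint8_t B) {
  bool C = static_cast<uint32_t>(A) > static_cast<uint32_t>(B); // cmp on widened values
  uint8_t R = C ? A : B;                                        // select on originals
  // R == std::max(A, B) for all inputs, since zero extension preserves
  // unsigned ordering.
  return R;
}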
+ if (isa(Op1) && isa(Op2)) { + ConstantAggregateZero *C1 = cast(Op1); + ConstantAggregateZero *C2 = cast(Op2); + if (C1->getNumElements() != C2->getNumElements()) + return false; + Type *C1Ty = C1->getType(); + Type *C2Ty = C2->getType(); + if (C1Ty->isVectorTy()) { + C1Ty = C1Ty->getSequentialElementType(); + C2Ty = C2Ty->getSequentialElementType(); + } + + return (C1Ty->isIntegerTy() && C2Ty->isIntegerTy()) || + (C1Ty->isFloatingPointTy() && C2Ty->isFloatingPointTy()); + } + + // ConstantDataVectors aren't always matched as different instances are + // constructed containing the same values, so we'll compare the values to + // catch this case. + llvm::ConstantDataVector *C1 = dyn_cast(Op1); + llvm::ConstantDataVector *C2 = dyn_cast(Op2); + if (!C1 || !C2 || (C1->getNumElements() != C2->getNumElements())) + return false; + + Type *C1Ty = C1->getElementType(); + Type *C2Ty = C2->getElementType(); + if (C1Ty->isIntegerTy() && C2Ty->isIntegerTy()) { + for (unsigned i = 0, e = C1->getNumElements(); i < e; ++i) + if (C1->getElementAsInteger(i) != C2->getElementAsInteger(i)) + return false; + return true; + } + + if (C1Ty->isFloatingPointTy() && C2Ty->isFloatingPointTy()) { + for (unsigned i = 0, e = C1->getNumElements(); i < e; ++i) { + double C1Val = C1Ty->isFloatTy() ? C1->getElementAsFloat(i) + : C1->getElementAsDouble(i); + double C2Val = C2Ty->isFloatTy() ? C2->getElementAsFloat(i) + : C2->getElementAsDouble(i); + if (C1Val != C2Val) + return false; + } + return true; + } + + return false; +} + +bool MinMaxMatcher::matchMinMax() { + assert(SelInst->getOpcode() == Instruction::Select && "expected SelectInst"); + if ((CmpInst = dyn_cast(SelInst->getOperand(0)))) { + Srcs[0] = SelInst->getOperand(1); + Srcs[1] = SelInst->getOperand(2); + CmpSrcs[0] = CmpInst->getOperand(0); + CmpSrcs[1] = CmpInst->getOperand(1); + + bool Inverse = false; + if (valuesMatch(CmpSrcs[1], Srcs[0]) && valuesMatch(CmpSrcs[0], Srcs[1])) + Inverse = true; + else if (!(valuesMatch(CmpSrcs[0], Srcs[0]) && + valuesMatch(CmpSrcs[1], Srcs[1]))) + return false; + + // We choose the min/max intrinsic based on the condition and whether the + // operand ordering is the same in the cmp and select. 
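A scalar worked example of the mapping chosen below, illustrative and not part of the patch: with a signed greater-than predicate, select(cmp, a, b) is a signed max, while the operand-swapped form select(cmp, b, a) is a signed min, which is exactly what the Inverse flag captures.

#include <algorithm>
#include <cassert>

static void checkMinMaxInversion(int A, int B) {
  bool Cmp = A > B;                        // icmp sgt A, B
  assert((Cmp ? A : B) == std::max(A, B)); // same operand order    -> smax
  assert((Cmp ? B : A) == std::min(A, B)); // swapped operand order -> smin
}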
+ switch (CmpInst->getPredicate()) { + default: + // this is not a candidate for min/max + return false; + case llvm::CmpInst::FCMP_OGE: + case llvm::CmpInst::FCMP_OGT: + if (Inverse) { + ID = GenXIntrinsic::genx_fmin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_fmax; + Annotation = "max"; + } + break; + case llvm::CmpInst::FCMP_OLE: + case llvm::CmpInst::FCMP_OLT: + if (Inverse) { + ID = GenXIntrinsic::genx_fmax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_fmin; + Annotation = "min"; + } + break; + case llvm::CmpInst::ICMP_SGE: + case llvm::CmpInst::ICMP_SGT: + if (Inverse) { + ID = GenXIntrinsic::genx_smin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_smax; + Annotation = "max"; + } + break; + case llvm::CmpInst::ICMP_SLE: + case llvm::CmpInst::ICMP_SLT: + if (Inverse) { + ID = GenXIntrinsic::genx_smax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_smin; + Annotation = "min"; + } + break; + case llvm::CmpInst::ICMP_UGE: + case llvm::CmpInst::ICMP_UGT: + if (Inverse) { + ID = GenXIntrinsic::genx_umin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_umax; + Annotation = "max"; + } + break; + case llvm::CmpInst::ICMP_ULE: + case llvm::CmpInst::ICMP_ULT: + if (Inverse) { + ID = GenXIntrinsic::genx_umax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_umin; + Annotation = "min"; + } + break; + } + } + + // Emit min/max if matched + return emit(); +} + +bool MinMaxMatcher::emit() { + if ((ID == GenXIntrinsic::not_any_intrinsic) || (Srcs[0] == nullptr) || + (Srcs[1] == nullptr)) + return false; + + IRBuilder<> Builder(SelInst); + Module *M = SelInst->getParent()->getParent()->getParent(); + Type *Tys[2] = {SelInst->getType(), Srcs[0]->getType()}; + Function *Fn = GenXIntrinsic::getAnyDeclaration(M, ID, Tys); + CallInst *CI = Builder.CreateCall(Fn, Srcs, Annotation); + CI->setDebugLoc(SelInst->getDebugLoc()); + SelInst->replaceAllUsesWith(CI); + + NumOfMinMaxMatched++; + return true; +} + +// For a given instruction, find the insertion position which is the closest +// to all the similar users to the specified reference user. +static std::tuple +findOptimalInsertionPos(Instruction *I, Instruction *Ref, DominatorTree *DT, + std::function IsSimilar) { + assert(!isa(Ref) && "PHINode is not expected!"); + + // Shortcut case. If it's single-used, insert just before that user. + if (I->hasOneUse()) + return std::make_tuple(nullptr, Ref); + + DenseMap BBs; + for (auto U : I->users()) { + Instruction *User = dyn_cast(U); + if (!User || !IsSimilar(User)) + continue; + BasicBlock *UseBB = User->getParent(); + DenseMap::iterator MI; + bool New = false; + std::tie(MI, New) = BBs.insert(std::make_pair(UseBB, User)); + if (New) + continue; + // Find the earliest user if they are in the same block. + BasicBlock::iterator BI = UseBB->begin(); + for (; &*BI != User && &*BI != MI->second; ++BI) + /* EMPTY */; + MI->second = &*BI; + } + + assert(BBs.size() != 0 && "Must find at least one BB!"); + + auto MI = BBs.begin(); + // Another shortcut case. 
If it's only used in a single BB, + if (BBs.size() == 1) + return std::make_tuple(MI->first, MI->second); + + BasicBlock *BB = MI->first; + for (++MI; MI != BBs.end(); ++MI) + BB = DT->findNearestCommonDominator(BB, MI->first); + + MI = BBs.find(BB); + Instruction *Pos = nullptr; + if (MI != BBs.end()) { + BB = MI->first; + Pos = MI->second; + } + assert(BB); + return std::make_tuple(BB, Pos); +} + +// For the specified constant, calculate its reciprocal if it's safe; +// otherwise, return null. +static Constant *getReciprocal(Constant *C, bool HasAllowReciprocal) { + assert(C->getType()->isFPOrFPVectorTy() && + "Floating point value is expected!"); + + // TODO: remove this and use ConstantExpr::getFDiv. + + // Reciprocal of undef can be undef. + if (isa(C)) + return C; + + if (ConstantFP *CFP = dyn_cast(C)) { + // Compute the reciprocal of C. + const APFloat &Divisor = CFP->getValueAPF(); + APFloat Rcp(Divisor.getSemantics(), 1U); + APFloat::opStatus Status = + Rcp.divide(Divisor, APFloat::rmNearestTiesToEven); + // Only fold it if it's safe. + if (Status == APFloat::opOK || + (HasAllowReciprocal && Status == APFloat::opInexact)) + return ConstantFP::get(C->getType()->getContext(), Rcp); + return nullptr; + } + + VectorType *VTy = cast(C->getType()); + IntegerType *ITy = Type::getInt32Ty(VTy->getContext()); + + SmallVector Result; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { + Constant *Elt = + ConstantExpr::getExtractElement(C, ConstantInt::get(ITy, i)); + Constant *Rcp = getReciprocal(Elt, HasAllowReciprocal); + // Skip if any of elements fails to be folded as reciprocal. + if (!Rcp) + return nullptr; + Result.push_back(Rcp); + } + return ConstantVector::get(Result); +} + +// For the given value, calculate its reciprocal and performance constant +// folding if allowed. +static Value *getReciprocal(IRBuilder<> &IRB, Value *V, + bool HasAllowReciprocal = true) { + if (Constant *C = dyn_cast(V)) + return getReciprocal(C, HasAllowReciprocal); + + if (!HasAllowReciprocal) + return nullptr; + + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Twine Name = V->getName() + ".inv"; + auto Func = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_inv, + V->getType()); + auto Inv = IRB.CreateCall(Func, V, Name); + return Inv; +} + +/// visitFDiv : reduce fdiv strength. +/// +/// If fast-math is present, perform the following transforms: +/// +/// (fdiv x, y) -> (fmul x0, (fdiv 1., x1)) +/// (fdiv 1., x) -> (rcp x) +/// (fdiv 1., (sqrt x)) -> (rsqrt x) +/// +/// Otherwise, try to reduce fdiv with constant divisor to fmul if the +/// reciprocal is exact. +/// +void GenXPatternMatch::visitFDiv(BinaryOperator &I) { + if (isInstructionTriviallyDead(&I)) { + // Clean up dead 'fdiv', which may be left due to the limitation of + // iterator used in instruction visitor, where only the instruction being + // visited could be safely erased/removed. + I.eraseFromParent(); + Changed |= true; + return; + } + + IRBuilder<> IRB(&I); + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // Constant folding Op1 if it's safe. + if (Constant *C1 = dyn_cast(Op1)) { + Constant *Rcp = getReciprocal(C1, I.hasAllowReciprocal()); + if (!Rcp) + return; + IRB.setFastMathFlags(I.getFastMathFlags()); + Value *FMul = IRB.CreateFMul(Op0, Rcp); + I.replaceAllUsesWith(FMul); + I.eraseFromParent(); + Changed |= true; + return; + } + + // Skip if reciprocal optimization is not allowed. 
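A small numeric illustration of the "only fold it if it's safe" rule above, not part of the patch: a power-of-two divisor has an exactly representable reciprocal, so the division can always become a multiply, while 1/3 is inexact and is only used when reciprocal fast-math is allowed.

#include <cassert>

static void reciprocalExactnessExample() {
  // 1.0f / 4.0f is exactly 0.25f, so x / 4.0f can always be rewritten as
  // x * 0.25f without changing any result.
  assert(1.0f / 4.0f == 0.25f);
  // 1.0f / 3.0f is rounded, so x / 3.0f -> x * (1.0f / 3.0f) can change the
  // last bit of the quotient and is only done when hasAllowReciprocal() holds.
}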
+ if (!I.hasAllowReciprocal()) + return; + + Instruction *Divisor = dyn_cast(Op1); + if (!Divisor) + return; + + auto IsSimilar = [](Instruction *User) { + return User->getOpcode() == Instruction::FDiv && User->hasAllowReciprocal(); + }; + + BasicBlock *BB = nullptr; + Instruction *Pos = nullptr; + std::tie(BB, Pos) = findOptimalInsertionPos(Divisor, &I, DT, IsSimilar); + if (Pos) + IRB.SetInsertPoint(Pos); + else + IRB.SetInsertPoint(BB); + auto Rcp = getReciprocal(IRB, Divisor); + cast(Rcp)->setDebugLoc(I.getDebugLoc()); + + for (auto U : Divisor->users()) { + Instruction *User = dyn_cast(U); + if (!User || User == Rcp || !IsSimilar(User)) + continue; + Op0 = User->getOperand(0); + Value *NewVal = Rcp; + if (!match(Op0, m_FPOne())) { + IRB.SetInsertPoint(User); + IRB.setFastMathFlags(User->getFastMathFlags()); + NewVal = IRB.CreateFMul(Op0, Rcp); + } + User->replaceAllUsesWith(NewVal); + // Skip removing dead instruction if it's the current instruction being + // visited as that might invalidate the iterator of this BB. These dead + // 'fdiv' will be removed when they are visited then. + if (User == &I) + User->eraseFromParent(); + } + Changed |= true; + return; +} + +namespace { + +class MulLike { +public: + virtual ~MulLike() {} + static MulLike &get(Instruction *I); + + virtual Instruction *getMul(Instruction *) const { return nullptr; } + virtual bool isAdd(User *) const { return false; } +}; + +class FPMulLike : public MulLike { +public: + Instruction *getMul(Instruction *I) const override { + if (isa(I)) + return I; + return nullptr; + } + bool isAdd(User *U) const override { + return isa(U) || isa(U); + } +}; + +class IntMulLike : public MulLike { +public: + Instruction *getMul(Instruction *I) const override { + if (isa(I) || isa(I)) + return I; + return nullptr; + } + bool isAdd(User *U) const override { + if (isa(U) || isa(U)) + return true; + if (CallInst *CI = dyn_cast(U)) { + switch (GenXIntrinsic::getGenXIntrinsicID(CI)) { + // Keep this list consistent with the one used for matchIntegerMad(IID). + case GenXIntrinsic::genx_ssadd_sat: + case GenXIntrinsic::genx_suadd_sat: + case GenXIntrinsic::genx_usadd_sat: + case GenXIntrinsic::genx_uuadd_sat: + return true; + default: + break; + } + } + return false; + } +}; + +MulLike &MulLike::get(Instruction *I) { + Type *Ty = I->getType()->getScalarType(); + if (Ty->isFloatingPointTy()) { + static FPMulLike FPMul; + return FPMul; + } + if (Ty->isIntegerTy()) { + static IntMulLike IntMul; + return IntMul; + } + static MulLike Null; + return Null; +} +} // End anonymous namespace + +bool GenXPatternMatch::propagateFoldableRegion(Function *F) { + ReversePostOrderTraversal RPOT(F); + bool Changed = false; + for (auto *BB : RPOT) + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + MulLike &Ring = MulLike::get(&*BI); + Instruction *Mul = Ring.getMul(&*BI); + if (!Mul) + continue; + // Traverse each wrregion use of mul. + for (auto *User : Mul->users()) { + if (!GenXIntrinsic::isWrRegion(User)) + continue; + GenXIntrinsicInst *WII = cast(User); + if (WII->getOperand(1) != Mul) + continue; + Region W(WII, BaleInfo()); + Region V(Mul); + // TODO: Consider the broadcast and similar cases. + if (!W.isStrictlySimilar(V)) + continue; + // Check if all rdregion usage could be folded. + SmallVector Rds; + SmallVector Wrs; // Assume just 1 live wrregion. 
+ Wrs.push_back(WII); + bool HasUnsafeUse = false; + while (!HasUnsafeUse && !Wrs.empty()) { + GenXIntrinsicInst *II = Wrs.back(); + Wrs.pop_back(); + for (auto *U : II->users()) { + if (GenXIntrinsic::isRdRegion(U)) { + GenXIntrinsicInst *RII = cast(U); + Region R(RII, BaleInfo()); + if (R == W) { + for (auto *U2 : RII->users()) + if (!Ring.isAdd(U2)) { + HasUnsafeUse = true; + break; + } + if (HasUnsafeUse) + break; + Rds.push_back(RII); + } else if (R.overlap(W)) { + HasUnsafeUse = true; + break; + } + } else if (GenXIntrinsic::isWrRegion(U)) { + GenXIntrinsicInst *WII2 = cast(U); + Region W2(WII2, BaleInfo()); + if (W2 == W) { + // No more wrregion needs tracing. DO NOTHING. + } else if (W2.overlap(W)) { + HasUnsafeUse = true; + break; + } else // Otherwise, look over that non-overlapping wrregion. + Wrs.push_back(WII2); + } else { + HasUnsafeUse = true; + break; + } + } + } + // Skip if there is any unsafe use. + if (HasUnsafeUse) + continue; + auto *ScalarOrVectorMul = scalarizeOrVectorizeIfNeeded(Mul, Rds.begin(), Rds.end()); + // Fold mul directly into its use after wrregion/rdregion pair. + for (auto *II : Rds) { + if (II->getType() != Mul->getType()) + II->replaceAllUsesWith(ScalarOrVectorMul); + else + II->replaceAllUsesWith(Mul); + Changed = true; + } + // Collapse wrregion if there are rdregion folded away. + if (!Rds.empty()) { + WII->replaceAllUsesWith(WII->getArgOperand(0)); + Changed = true; + } + } + } + return Changed; +} + +// Simplify: +// %1 = zext i8 %0 to i32> +// %2 = bitcast i32 %2 to <32 x i1> +// %3 = call <8 x i1> @llvm.genx.rdpredregion.v8i1.v32i1(<32 x i1> %2, i32 0) +// into +// %1 = bitcast i8 %0 to <8 x i1> +// RAUW %1 +// +bool GenXPatternMatch::simplifyPredRegion(CallInst *CI) { + assert(GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_rdpredregion); + bool Changed = false; + + unsigned NElts = CI->getType()->getVectorNumElements(); + ConstantInt *C = dyn_cast(CI->getArgOperand(1)); + assert(C && "constant integer expected"); + unsigned Offset = (unsigned)C->getZExtValue(); + assert(Offset % NElts == 0); + + // The number of actual bits required. 
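A worked example of the "number of actual bits required" computed just below, illustrative and not part of the patch: the element count plus the read offset is rounded up to a power of two, and the fold to a plain bitcast only applies when the zext input already has exactly that many bits.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void predRegionBitsExample() {
  // Reading 8 predicate elements at offset 0 needs the low 8 source bits,
  // so a zext from i8 can be folded into a direct bitcast to <8 x i1>.
  assert((1u << llvm::Log2_32_Ceil(8u + 0u)) == 8u);
  // Reading 8 elements at offset 8 needs 16 bits, so only an i16-sized input
  // would qualify for the same fold.
  assert((1u << llvm::Log2_32_Ceil(8u + 8u)) == 16u);
}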
+ unsigned NBits = NElts + Offset; + NBits = 1U << llvm::Log2_32_Ceil(NBits); + + Value *Src = CI->getArgOperand(0); + Value *Input = nullptr; + if (match(Src, m_BitCast(m_ZExt(m_Value(Input))))) { + unsigned InputBits = Input->getType()->getPrimitiveSizeInBits(); + if (NBits == InputBits) { + IRBuilder<> Builder(CI); + auto BC = Builder.CreateBitCast(Input, CI->getType(), "bitcast"); + if (auto Inst = dyn_cast(BC)) + Inst->setDebugLoc(CI->getDebugLoc()); + CI->replaceAllUsesWith(BC); + Changed = true; + } + } + return Changed; +} + +bool GenXPatternMatch::simplifyRdRegion(CallInst* Inst) { + assert(GenXIntrinsic::isRdRegion(Inst)); + auto NewVTy = Inst->getType(); + // rewrite indirect rdregion with constant offsets + auto R = Region::getWithOffset(Inst, false /*ParentWidth*/); + if (R.Indirect && R.IndirectIdx == 0 && R.IndirectAddrOffset == 0) { + int64_t starti = 0; + int64_t diffi = 0; + if (IsLinearVectorConstantInts(R.Indirect, starti, diffi)) { + R.Indirect = nullptr; + R.Width = NewVTy->getVectorNumElements(); + R.Offset += starti; + R.Stride = (diffi * 8) / NewVTy->getVectorElementType()->getPrimitiveSizeInBits(); + R.VStride = 0; + Value* OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewInst = R.createRdRegion(OldV, Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + return true; + } + } + return false; +} + +bool GenXPatternMatch::simplifyWrRegion(CallInst *Inst) { + assert(GenXIntrinsic::isWrRegion(Inst)); + Value *NewV = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Type *NewVTy = NewV->getType(); + + // Rewrite a single element insertion to undef as a region splat. + auto check1 = [=]() { + Value *OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (!isa(OldV)) + return false; + if (NewVTy->isVectorTy() && NewVTy->getVectorNumElements() > 1) + return false; + // Do not rewrite if input is another region read, as two region reads + // cannot be groupped into a single bale. + if (GenXIntrinsic::isRdRegion(NewV)) + return false; + for (auto U : Inst->users()) { + if (auto BC = dyn_cast(U)) { + for (auto User : BC->users()) + if (GenXIntrinsic::isWrRegion(User)) + return false; + } + + if (GenXIntrinsic::isWrRegion(U)) + return false; + } + + // OK, rewrite it! + return true; + }; + + if (check1()) { + if (!NewVTy->isVectorTy()) { + IRBuilder<> B(Inst); + NewV = B.CreateBitCast(NewV, VectorType::get(NewVTy, 1)); + } + Region R(Inst->getType()); + R.Width = R.NumElements; + R.Stride = 0; + NewV = R.createRdRegion(NewV, "splat", Inst, Inst->getDebugLoc(), false); + Inst->replaceAllUsesWith(NewV); + return true; + } + + // rewrite indirect wrregion with constant offsets + auto R = Region::getWithOffset(Inst, false/*ParentWidth*/); + if (R.Indirect && R.IndirectIdx == 0 && R.IndirectAddrOffset == 0) { + int64_t starti = 0; + int64_t diffi = 0; + if (IsLinearVectorConstantInts(R.Indirect, starti, diffi)) { + R.Indirect = nullptr; + R.Width = NewVTy->getVectorNumElements(); + R.Offset += starti; + R.Stride = (diffi * 8) / NewVTy->getVectorElementType()->getPrimitiveSizeInBits(); + R.VStride = 0; + Value* OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewInst = R.createWrRegion(OldV, NewV, Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + return true; + } + } + return false; +} + +// Simplify (trunc.sat (ext V)) to (trunc.sat V). 
Even if the source and +// destination has the same type, it's incorrect to fold them into V directly +// as the saturation is necessary. +bool GenXPatternMatch::simplifyTruncSat(CallInst *Inst) { + assert(GenXIntrinsic::isIntegerSat(Inst) && "Unexpected integer saturation intrinsic!"); + + GenXIntrinsicInst *II = cast(Inst); + ExtOperator *Ext = dyn_cast(Inst->getOperand(0)); + if (!Ext) + return false; + + auto IID = GenXIntrinsic::getGenXIntrinsicID(II); + Value *Src = Ext->getOperand(0); + bool isZExt = (Ext->getOpcode() == Instruction::ZExt); + + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_sutrunc_sat + : GenXIntrinsic::genx_sstrunc_sat; + break; + case GenXIntrinsic::genx_sutrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_sutrunc_sat + : GenXIntrinsic::genx_sstrunc_sat; + break; + case GenXIntrinsic::genx_ustrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_uutrunc_sat + : GenXIntrinsic::genx_ustrunc_sat; + break; + case GenXIntrinsic::genx_uutrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_uutrunc_sat + : GenXIntrinsic::genx_ustrunc_sat; + break; + default: + llvm_unreachable("Unknown intrinsic!"); + } + + Module *M = Inst->getParent()->getParent()->getParent(); + Type *Tys[2] = {Inst->getType(), Src->getType()}; + Function *Fn = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + + Inst->setCalledFunction(Fn); + Inst->setOperand(0, Src); + + return true; +} + +// Merge select into a write region if possible. +// +// a = rrd(x, R); a = rrd(x, R) +// c = a op b ==> c = a op b +// d = select p, c, a +// wrr(x, d, R) wrr(x, c, R, p) +// +bool GenXPatternMatch::simplifySelect(Function *F) { + using namespace GenXIntrinsic::GenXRegion; + + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*empty*/) { + SelectInst *Inst = dyn_cast(&*BI++); + if (!Inst || !Inst->hasOneUse() || !Inst->getType()->isVectorTy() || + !Inst->getCondition()->getType()->isVectorTy()) + continue; + if (!GenXIntrinsic::isWrRegion(Inst->user_back())) + continue; + CallInst *Wr = cast(Inst->user_back()); + if (Wr->getOperand(NewValueOperandNum) != Inst) + continue; + + auto match = [](Instruction *Wr, Value *V) -> bool { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + CallInst *Rd = cast(V); + if (Wr->getOperand(OldValueOperandNum) != + Rd->getOperand(OldValueOperandNum)) + return false; + + Region WrReg(Wr, BaleInfo()); + Region RdReg(Rd, BaleInfo()); + if (WrReg != RdReg || WrReg.Indirect) + return false; + + if (WrReg.Mask == nullptr) + return true; + if (auto C = dyn_cast(WrReg.Mask)) + if (C->isAllOnesValue()) + return true; + + return false; + }; + + for (int i = 1; i <= 2; ++i) { + Value *Op = Inst->getOperand(i); + if (match(Wr, Op)) { + Value *Mask = Inst->getCondition(); + if (i == 1) { + IRBuilder<> B(Inst); + Mask = B.CreateNot(Mask, "not"); + } + + Region WrReg(Wr, BaleInfo()); + WrReg.Mask = Mask; + Value *NewWr = WrReg.createWrRegion( + Wr->getOperand(OldValueOperandNum), Inst->getOperand(3 - i), + Wr->getName(), Wr, Wr->getDebugLoc()); + Wr->replaceAllUsesWith(NewWr); + Changed = true; + + if (Wr == &*BI) + ++BI; + Wr->eraseFromParent(); + Inst->eraseFromParent(); + break; + } + } + } + } + + return Changed; +} + +// Perform volatile global related simplifications. 
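+// Loads from volatile globals are normalized, and instructions left trivially
+// dead by that rewrite are erased afterwards.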
+bool GenXPatternMatch::simplifyVolatileGlobals(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end(); /*empty*/) { + Instruction *Inst = &*I++; + if (isa(Inst)) + Changed |= normalizeGloads(Inst); + } + for (auto I = BB.rbegin(); I != BB.rend(); /*empty*/) { + Instruction *Inst = &*I++; + if (isInstructionTriviallyDead(Inst)) + Inst->eraseFromParent(); + } + } + return Changed; +} + +// Decompose predicate operand for large vector selects. +bool GenXPatternMatch::decomposeSelect(Function *F) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? P->getSubtarget() : nullptr; + SelectDecomposer SD(ST); + for (auto &BB : F->getBasicBlockList()) + for (auto &Inst : BB.getInstList()) + if (isa(Inst)) + SD.addStartSelect(&Inst); + + return SD.run(); +} + +bool GenXPatternMatch::reassociateIntegerMad(Function *F) { + auto isSingleUsedAdd = [](Value *V) -> bool { + auto BO = dyn_cast(V); + if (!BO || !BO->hasOneUse()) + return false; + // FIXME: Consider 'sub' as well. + return BO->getOpcode() == Instruction::Add; + }; + + auto isSingleUsedMul = [](Value *V) -> bool { + auto BO = dyn_cast(V); + if (!BO || !BO->hasOneUse()) + return false; + return (BO->getOpcode() == Instruction::Mul || + BO->getOpcode() == Instruction::Shl); + }; + + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) { + if (!isSingleUsedAdd(&*BI)) { + ++BI; + continue; + } + + auto BO = cast(&*BI); + if (!isSingleUsedMul(BO->getOperand(0)) || + !isSingleUsedMul(BO->getOperand(1))) { + ++BI; + continue; + } + + // Found (a0 * b0) + (a1 * b1), track through the chain to check it is + // + // (a0 * b0) + (a1 * b1) + ... + c + // + // and transform it into + // + // c + (a0 * b0) + (a1 * b1) + ... + // + SmallVector AccChain; + AccChain.push_back(BO); + bool Found = false; + unsigned OpndNo = 0; + while (!Found) { + Use &U = *BO->use_begin(); + if (!isSingleUsedAdd(U.getUser())) + break; + BO = cast(U.getUser()); + if (BO->getParent() != &BB) + break; + if (!isSingleUsedMul(BO->getOperand(1 - U.getOperandNo()))) { + OpndNo = 1 - U.getOperandNo(); + Found = true; + } + AccChain.push_back(BO); + } + if (!Found) { + ++BI; + continue; + } + + BO = AccChain.back(); + AccChain.pop_back(); + + IRBuilder<> IRB(BO); + // Reconstruct a new accumulation chain. + Instruction *Acc = cast(IRB.CreateAdd( + BO->getOperand(OpndNo), AccChain.front()->getOperand(0))); + OpndNo = 1; + for (auto CI = AccChain.begin(), CE = AccChain.end(); CI != CE; ++CI) { + auto BO2 = *CI; + Value *Opnd = BO2->getOperand(OpndNo); + Acc = cast(IRB.CreateAdd(Acc, Opnd)); + Acc->setDebugLoc(BO2->getDebugLoc()); + Use &U = *BO2->use_begin(); + OpndNo = 1 - U.getOperandNo(); + } + BO->replaceAllUsesWith(Acc); + + // Erase old accumulation chain. + BI = std::next(BasicBlock::iterator(BO)); + BO->eraseFromParent(); + while (!AccChain.empty()) { + BO = AccChain.back(); + AccChain.pop_back(); + BI = std::next(BasicBlock::iterator(BO)); + BO->eraseFromParent(); + } + Changed = true; + } + } + + return Changed; +} + +bool GenXPatternMatch::distributeIntegerMul(Function *F) { + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) { + auto Mul = dyn_cast(&*BI++); + if (!Mul || Mul->getType()->getScalarSizeInBits() < 32) + continue; + // Find the following pattern + // + // A * (B + C) and all components are extended from 8-/16-bit integers. 
+ // + // and transform it to + // + // A * B + A * C. + // + // This transformation won't bring two much difference on SKL but could + // improve code quality a lot on platforms without multiplication of + // D * D -> D, e.g. CNL. + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); + if (!isa(LHS)) + std::swap(LHS, RHS); + // Skip if both LHS & RHS are not ext operators. + if (!isa(LHS)) + continue; + // Skip if both LHS & RHS are already operands extended from narrow + // types. + if (isa(RHS)) + continue; + + auto collect = [](Value *V, SmallVectorImpl &Ops) -> bool { + SmallVector CheckList; + CheckList.push_back(V); + + while (!CheckList.empty()) { + V = CheckList.pop_back_val(); + // Collect values if they are extended from narrow types. + if (isa(V)) { + Ops.push_back(V); + continue; + } + // FIXME: Add 'sub' support. + AddOperator *Add = dyn_cast(V); + if (!Add || !Add->hasOneUse()) + return true; + // DFT that 'add' tree. + CheckList.push_back(Add->getOperand(1)); + CheckList.push_back(Add->getOperand(0)); + } + + return false; + }; + + SmallVector Ops; + if (collect(RHS, Ops)) + continue; + + assert(!Ops.empty() && "There's no operands collected!"); + + IRBuilder<> Builder(cast(Mul)); + Value *Sum = nullptr; + for (auto V : Ops) { + Value *Prod = Builder.CreateMul(LHS, V); + if (!Sum) + Sum = Prod; + else + Sum = Builder.CreateAdd(Sum, Prod); + } + Mul->replaceAllUsesWith(Sum); + RecursivelyDeleteTriviallyDeadInstructions(Mul); + + Changed = true; + } + } + return Changed; +} + +// The shift pattern: +// V[0:7] = ShtAmt[0] +// V[8:15] = ShtAmt[0] + ShtAmt[1] +// V[16:23] = ShtAmt[0] + ShtAmt[2] +// V[24:31] = ShtAmt[0] + ShtAmt[3] +// where ShtAmt[0] is a constant vector and ShtAmt[i] are constant splats. +static bool analyzeForShiftPattern(Constant *C, + SmallVectorImpl &ShtAmt, + const DataLayout &DL) { + unsigned Width = 8; + VectorType *VT = dyn_cast(C->getType()); + if (!VT || VT->getVectorNumElements() <= Width || + VT->getScalarSizeInBits() == 1) + return false; + unsigned NElts = VT->getVectorNumElements(); + if (NElts % Width != 0) + return false; + + SmallVector Elts(Width, nullptr); + for (unsigned i = 0; i < Width; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (isa(Elt)) + return false; + Elts[i] = Elt; + } + Constant *Base = ConstantVector::get(Elts); + ShtAmt.push_back(Base); + + for (unsigned i = Width; i < NElts; i += Width) { + SmallVector Elts(Width, nullptr); + for (unsigned j = 0; j < Width; ++j) { + Constant *Elt = C->getAggregateElement(i + j); + if (isa(Elt)) + return false; + Elts[j] = Elt; + } + unsigned Op = Base->getType()->isFPOrFPVectorTy() ? Instruction::FSub + : Instruction::Sub; + Constant *A[] = {ConstantVector::get(Elts), Base}; + auto X = ConstantFoldBinaryOpOperands(Op, A[0], A[1], DL); + if (!X) + return false; + if (!X->getSplatValue()) { + // This is not a splat and it is an integer vector. + if (!Base->getType()->isFPOrFPVectorTy()) + return false; + + // Check if A and B are within a few ULPs. + auto isWithinMaxULP = [](APFloat A, APFloat B, unsigned NSteps) { + APFloat::cmpResult cmpRes = A.compare(B); + if (cmpRes == APFloat::cmpEqual) + return true; + if (cmpRes == APFloat::cmpUnordered) + return false; + + unsigned MAX_ULP = 3 * NSteps; + bool nextDown = cmpRes == APFloat::cmpGreaterThan; + for (unsigned i = 0; i < MAX_ULP; ++i) { + A.next(nextDown); + if (A.compare(B) == APFloat::cmpEqual) + return true; + } + return false; + }; + + // This is not an exact splat fp vector. 
We check if they are within a few + // ULPs, as divisions are actually not correctly rounded during folding. + ConstantFP *X0 = dyn_cast_or_null(X->getAggregateElement(0U)); + if (!X0) + return false; + for (unsigned j = 1; j < Width; ++j) { + ConstantFP *Xj = + dyn_cast_or_null(X->getAggregateElement(j)); + unsigned NSteps = NElts / Width; + if (!Xj || + !isWithinMaxULP(Xj->getValueAPF(), X0->getValueAPF(), NSteps)) + return false; + } + X = ConstantDataVector::getSplat(Width, X0); + } + ShtAmt.push_back(X); + } + return true; +} + +bool GenXPatternMatch::vectorizeConstants(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (isa(Inst)) + continue; + unsigned NumOpnds = Inst->getNumOperands(); + auto CI = dyn_cast(Inst); + if (CI) + NumOpnds = CI->getNumArgOperands(); + for (unsigned i = 0, e = NumOpnds; i != e; ++i) { + auto C = dyn_cast(Inst->getOperand(i)); + if (!C || isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + auto Ty = C->getType(); + if (!Ty->isVectorTy() || Ty->getVectorNumElements() < 16 || + C->getSplatValue()) + continue; + SmallVector ShtAmt; + if (analyzeForShiftPattern(C, ShtAmt, *DL)) { + // W1 = wrrregion(undef, ShtAmt[0], 0); + // V2 = fadd ShtAmt[0], ShtAmt[1] + // W2 = wrregion(W1, V2, Width) + // V3 = fadd ShtAmt[0], ShtAmt[2] + // W2 = wrregion(W2, V3, Width * 2) + // ... + Value *Base = nullptr; + { + Value *Args[] = {ShtAmt[0]}; + Type *Tys[] = {ShtAmt[0]->getType()}; + auto ID = C->getType()->isFPOrFPVectorTy() + ? GenXIntrinsic::genx_constantf + : GenXIntrinsic::genx_constanti; + Module *M = F->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, ID, Tys); + auto NewInst = CallInst::Create(Decl, Args, "constant", Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + Base = NewInst; + } + + IRBuilder<> Builder(Inst); + unsigned Width = ShtAmt[0]->getType()->getVectorNumElements(); + Region R(C->getType()); + R.getSubregion(0, Width); + Value *Val = UndefValue::get(C->getType()); + Val = R.createWrRegion(Val, Base, "", Inst, Inst->getDebugLoc()); + for (unsigned j = 1; j < (unsigned)ShtAmt.size(); ++j) { + auto Opc = C->getType()->isFPOrFPVectorTy() ? Instruction::FAdd + : Instruction::Add; + auto Input = Builder.CreateBinOp(Opc, Base, ShtAmt[j]); + Region R1(C->getType()); + R1.getSubregion(Width * j, Width); + Val = R1.createWrRegion(Val, Input, "", Inst, Inst->getDebugLoc()); + } + + // Update this operand with newly vectorized constant. 
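+          // If the visited instruction is itself a genx.constant{f,i}, replace
+          // it wholesale and erase it; otherwise only rewrite this operand.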
+ auto ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (ID == GenXIntrinsic::genx_constantf || + ID == GenXIntrinsic::genx_constanti) { + Inst->replaceAllUsesWith(Val); + Inst->eraseFromParent(); + } else + Inst->setOperand(i, Val); + + Changed = true; + } + } + } + } + + return Changed; +} + +static Instruction *insertConstantLoad(Constant *C, Instruction *InsertBefore) { + assert(!C->getType()->getScalarType()->isIntegerTy(1)); + Value *Args[] = {C}; + Type *Ty[] = {C->getType()}; + auto IntrinsicID = GenXIntrinsic::genx_constanti; + if (C->getType()->isFPOrFPVectorTy()) + IntrinsicID = GenXIntrinsic::genx_constantf; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *F = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, Ty); + Instruction *Inst = CallInst::Create(F, Args, "constant", InsertBefore); + Inst->setDebugLoc(InsertBefore->getDebugLoc()); + return Inst; +} + +bool GenXPatternMatch::placeConstants(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + auto ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (ID == GenXIntrinsic::genx_constantf || + ID == GenXIntrinsic::genx_constanti) + continue; + + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + auto C = dyn_cast(Inst->getOperand(i)); + if (!C || isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + auto Ty = C->getType(); + if (!Ty->isVectorTy() || C->getSplatValue()) + continue; + if (Ty->getScalarSizeInBits() == 1) + continue; + + // Counting the bit size of non-undef values. + unsigned NBits = 0; + for (unsigned i = 0, e = Ty->getVectorNumElements(); i != e; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (Elt && !isa(Elt)) + NBits += Ty->getScalarSizeInBits(); + } + if (NBits <= 256) + continue; + + // Collect uses inside this function. + SmallVector ConstantUses; + std::set ConstantUsers; + + for (auto &U : C->uses()) { + auto I = dyn_cast(U.getUser()); + if (!I || I->getParent()->getParent() != F) + continue; + ConstantUses.push_back(&U); + ConstantUsers.insert(I); + } + if (ConstantUsers.empty()) + continue; + + // Single use in a loop. + if (ConstantUsers.size() == 1) { + // Do not lift this constant, for now, to avoid spills. +#if 0 + Use *U = ConstantUses.back(); + Instruction *UseInst = cast(U->getUser()); + BasicBlock *UseBB = UseInst->getParent(); + if (Loop *L = LI->getLoopFor(UseBB)) { + if (BasicBlock *Preheader = L->getLoopPreheader()) { + if (Preheader != UseBB) { + // Insert constant initialization in loop preheader. + Instruction *InsertBefore = Preheader->getTerminator(); + Value *Val = insertConstantLoad(C, InsertBefore); + U->set(Val); + Changed = true; + } + } + } +#endif + continue; // skip to the next constant + } + + // It is profitable to use a common constant pool in register. + assert(ConstantUses.size() >= 2); + BasicBlock *InsertBB = nullptr; + for (auto U : ConstantUses) { + auto UseInst = cast(U->getUser()); + auto UseBB = UseInst->getParent(); + if (InsertBB == nullptr) + InsertBB = UseBB; + else if (InsertBB != UseBB) { + InsertBB = DT->findNearestCommonDominator(InsertBB, UseBB); + } + } + + // InsertBlock is in a loop. + if (Loop *L = LI->getLoopFor(InsertBB)) + if (BasicBlock *Preheader = L->getLoopPreheader()) + if (Preheader != InsertBB) + InsertBB = Preheader; + + // If the insert block is the same as some use block, find the first + // use instruction as the insert point. 
Otherwise, use the terminator of + // the insert block. + Instruction *InsertBefore = InsertBB->getTerminator(); + for (auto UseInst : ConstantUsers) { + if (InsertBB == UseInst->getParent()) { + for (auto &I : InsertBB->getInstList()) { + if (ConstantUsers.find(&I) != ConstantUsers.end()) { + InsertBefore = &I; + goto Found; + } + } + } + } + Found: + assert(!isa(InsertBefore)); + Value *Val = insertConstantLoad(C, InsertBefore); + for (auto U : ConstantUses) + U->set(Val); + Changed = true; + } + } + } + + return Changed; +} + +bool GenXPatternMatch::simplifyNullDst(CallInst *Inst) { + if (Inst->getNumUses() != 1) + return false; + + PHINode *Phi = dyn_cast(Inst->use_begin()->getUser()); + if (Phi == nullptr) + return false; + + if (Phi->getNumUses() == 1 && Phi->use_begin()->getUser() == Inst) { + Phi->replaceAllUsesWith(UndefValue::get(Phi->getType())); + Phi->eraseFromParent(); + return true; + } + + return false; +} + +bool canExtendMask(BinaryOperator *BO) { + Type *InstTy = BO->getType(); + auto Op0 = dyn_cast(BO->getOperand(0)); + auto Op1 = dyn_cast(BO->getOperand(1)); + return InstTy->isVectorTy() && + (InstTy->getScalarSizeInBits() == genx::ByteBits) && (Op0 || Op1); +} + +bool GenXPatternMatch::extendMask(BinaryOperator *BO) { + if (!canExtendMask(BO)) + return false; + + Type *InstTy = BO->getType(); + Type *I32Ty = Type::getInt32Ty(InstTy->getContext()); + unsigned SizeInBits = InstTy->getScalarSizeInBits(); + unsigned Scale = I32Ty->getPrimitiveSizeInBits() / SizeInBits; + unsigned NumElts = InstTy->getVectorNumElements(); + + // Cannot bitcast to + if (NumElts % Scale != 0) + return false; + NumElts /= Scale; + + Type *NewTy = VectorType::get(I32Ty, NumElts); + IRBuilder Builder(BO->getParent(), BasicBlock::iterator(BO), + TargetFolder(*DL)); + StringRef Name = BO->getName(); + + Value *Op0 = + Builder.CreateBitCast(BO->getOperand(0), NewTy, Name + ".extend.mask.op"); + Value *Op1 = + Builder.CreateBitCast(BO->getOperand(1), NewTy, Name + ".extend.mask.op"); + Value *NewAnd = Builder.CreateAnd(Op0, Op1, Name + ".extend.mask"); + NewAnd = Builder.CreateBitCast(NewAnd, InstTy, Name + ".extend.mask.trunc"); + + BO->replaceAllUsesWith(NewAnd); + + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp new file mode 100644 index 000000000000..608c60571d38 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp @@ -0,0 +1,171 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPostLegalization +/// -------------------- +/// +/// GenXPostLegalization is a function pass run after legalization with the +/// following purposes: +/// +/// 1. It inserts a constant load for most constants that are not representable +/// as a constant operand in GenX code. See the GenXConstants section below. +// (in the file GenXConstants.cpp) +/// +/// 2. It calls GenXVectorDecomposer to perform vector decomposition. See the +/// GenXVectorDecomposer section below. +// (in the file GenXVectorDecomposer.h) +/// +/// Both of these things are done here because the results of them (constant +/// loads and decomposed vector operations) may benefit from CSE run after +/// this pass. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_POST_LEGALIZATION" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXConstants.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVectorDecomposer.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +namespace { + +// GenXPostLegalization : post-legalization pass +class GenXPostLegalization : public FunctionPass { + DominatorTree *DT = nullptr; + VectorDecomposer VD; + const DataLayout *DL = nullptr; + const GenXSubtarget *ST = nullptr; +public: + static char ID; + explicit GenXPostLegalization() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX post-legalization pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +}; + +} // end namespace llvm + + +char GenXPostLegalization::ID = 0; +namespace llvm { void initializeGenXPostLegalizationPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXPostLegalization, "GenXPostLegalization", "GenXPostLegalization", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXPostLegalization, "GenXPostLegalization", "GenXPostLegalization", false, false) + +FunctionPass *llvm::createGenXPostLegalizationPass() +{ + initializeGenXPostLegalizationPass(*PassRegistry::getPassRegistry()); + return new GenXPostLegalization; +} + +void GenXPostLegalization::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * GenXPostLegalization::runOnFunction : process one function + */ +bool GenXPostLegalization::runOnFunction(Function &F) +{ + DL = &F.getParent()->getDataLayout(); + auto P = getAnalysisIfAvailable(); + if (P) + ST = P->getSubtarget(); + else + return false; + DT = &getAnalysis().getDomTree(); + + bool Modified = false; + Modified |= breakConstantExprs(&F); + + for (Function::iterator fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = 
BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + switch (GenXIntrinsic::getAnyIntrinsicID(Inst)) { + default: + // Lower non-simple constant operands. + Modified |= loadNonSimpleConstants(Inst, nullptr, ST); + break; + case Intrinsic::fma: + Modified |= loadConstants(Inst, ST); + break; + } + + // If this is a wrregion with constant input, or phi node input, give it + // to the vector decomposer. (We could just give it all wrregions, but we + // are trying to minimize the amount of work it has to do.) + if (!ST->disableVectorDecomposition()) { + if (GenXIntrinsic::isWrRegion(Inst)) { + if (isa(Inst->getOperand(0))) + VD.addStartWrRegion(Inst); + else if (isa(Inst->getOperand(0))) + VD.addStartWrRegion(Inst); + } + } + } + } + // Run the vector decomposer for this function. + Modified |= VD.run(DT); + // Cleanup region reads and writes. + Modified |= simplifyRegionInsts(&F, DL); + // Cleanup redundant global loads. + Modified |= cleanupLoads(&F); + // Legalize constants in return. + for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + BasicBlock *BB = &*FI; + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + Instruction *Inst = &*BI; + if (isa(Inst)) { + Modified |= loadNonSimpleConstants(Inst, nullptr, ST); + Modified |= loadConstants(Inst, ST); + } + } + } + + return Modified; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp new file mode 100644 index 000000000000..e32798ea829a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp @@ -0,0 +1,211 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXPressureTracker.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXRegion.h" +#include "vc/GenXOpts/Utils/RegCategory.h" + +using namespace llvm; +using namespace genx; + +namespace { + +struct LiveRangeAndLength { + LiveRange *LR; + unsigned Length; + LiveRangeAndLength(LiveRange *LR, unsigned Length) : LR(LR), Length(Length) {} + bool operator<(const LiveRangeAndLength &Rhs) const { + return Length > Rhs.Length; + } +}; + +} // namespace + +unsigned PressureTracker::getSizeInBytes(LiveRange *LR, bool AllowWidening) { + SimpleValue SV = *LR->value_begin(); + Value *V = SV.getValue(); + Type *Ty = IndexFlattener::getElementType(V->getType(), SV.getIndex()); + unsigned Bytes = (Ty->getPrimitiveSizeInBits() + 15U) / 8U & -2U; + if (!AllowWidening) + return Bytes; + + // Check if this will be a live range to be promoted to a word vector: + // - this is a byte vector + // - non-of values will be used in indirect regions + // - all uses are in the same block (local variables only) + // + auto toWiden = [=]() -> bool { + if (!Ty->isVectorTy() || !Ty->getVectorElementType()->isIntegerTy(8)) + return false; + + BasicBlock *DefBB = nullptr; + for (auto I = LR->value_begin(), E = LR->value_end(); I != E; ++I) { + auto Inst = dyn_cast((*I).getValue()); + if (!Inst) + return false; + if (!DefBB) + DefBB = Inst->getParent(); + if (DefBB != Inst->getParent() || Inst->isUsedOutsideOfBlock(DefBB)) + return false; + for (auto UI : Inst->users()) { + if (GenXIntrinsic::isRdRegion(UI) || GenXIntrinsic::isWrRegion(UI)) { + Region R(cast(UI), BaleInfo()); + if (R.Indirect) + return false; + } + } + } + + // OK, this is a candidate for widening. + return true; + }; + + if (toWiden()) { + WidenCandidates.push_back(LR); + Bytes *= 2; + } + return Bytes; +} + +// Decrease pressure assuming no widening on variable for LR. +void PressureTracker::decreasePressure(LiveRange *LR) { + if (!LR || LR->getCategory() != RegCategory::GENERAL) + return; + +#if _DEBUG + auto I = std::find(WidenCandidates.begin(), WidenCandidates.end(), LR); + assert(I != WidenCandidates.end()); +#endif + + unsigned Bytes = getSizeInBytes(LR, /*AllowWidening*/ false); + for (auto SI = LR->begin(), SE = LR->end(); SI != SE; ++SI) { + for (unsigned i = SI->getStart(); i != SI->getEnd(); ++i) { + assert(i < Pressure.size()); + assert(Pressure[i] >= Bytes); + Pressure[i] -= Bytes; + } + } + calculateRedSegments(); +} + +void PressureTracker::calculate() { + std::vector LRs; + getLiveRanges(LRs); + std::vector LRLs; + for (auto LR : LRs) + LRLs.emplace_back(LR, LR->getLength(/*WithWeak*/ false)); + LRs.clear(); + std::sort(LRLs.begin(), LRLs.end()); + + // Keep count of the rp at each instruction number. + Pressure.clear(); + for (auto &I : LRLs) { + LiveRange *LR = I.LR; + unsigned Bytes = getSizeInBytes(LR, WithByteWidening); + for (auto SI = LR->begin(), SE = LR->end(); SI != SE; ++SI) { + if (SI->getEnd() >= Pressure.size()) + Pressure.resize(SI->getEnd() + 1, 0); + for (unsigned i = SI->getStart(); i != SI->getEnd(); ++i) + Pressure[i] += Bytes; + } + } +} + +// Calculate high pressure segments. 
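+// A segment is a maximal run of instruction numbers whose estimated register
+// pressure stays at or above THRESHOLD.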
+void PressureTracker::calculateRedSegments() { + HighPressureSegments.clear(); + unsigned UNDEF = std::numeric_limits::max(); + unsigned B = UNDEF; + unsigned E = UNDEF; + for (unsigned i = 0; i < Pressure.size(); ++i) { + if (Pressure[i] >= THRESHOLD) { + if (B == UNDEF) + B = i; + else + E = i; + } else { + if (B != UNDEF && E != UNDEF) + HighPressureSegments.emplace_back(B, E); + else if (B != UNDEF) + HighPressureSegments.emplace_back(B, B); + B = E = UNDEF; + } + } +} + +// Check if segment [B, E] intersects with a high pressure region or not. +bool PressureTracker::intersectWithRedRegion(unsigned B, unsigned E) const { + for (auto S : HighPressureSegments) { + unsigned B1 = S.Begin; + unsigned E1 = S.End; + if (B > E1) + continue; + return E >= B1; + } + return false; +} + +bool PressureTracker::intersectWithRedRegion(LiveRange *LR) const { + if (!LR || LR->getCategory() == RegCategory::NONE) + return false; + for (auto I = LR->begin(), E = LR->end(); I != E; ++I) + if (intersectWithRedRegion(I->getStart(), I->getEnd())) + return true; + return false; +} + +void PressureTracker::getLiveRanges(std::vector &LRs) { + for (auto I = FG.begin(), E = FG.end(); I != E; ++I) { + Function *F = *I; + for (auto &Arg : F->args()) + getLiveRangesForValue(&Arg, LRs); + if (I != FG.begin() && !F->getReturnType()->isVoidTy()) + getLiveRangesForValue(Liveness->getUnifiedRet(F), LRs); + for (auto &BB : F->getBasicBlockList()) + for (auto &Inst : BB.getInstList()) + getLiveRangesForValue(&Inst, LRs); + } +} + +void PressureTracker::getLiveRangesForValue( + Value *V, std::vector &LRs) const { + auto Ty = V->getType(); + for (unsigned i = 0, e = IndexFlattener::getNumElements(Ty); i != e; ++i) { + SimpleValue SV(V, i); + LiveRange *LR = Liveness->getLiveRangeOrNull(SV); + if (!LR || LR->getCategory() == RegCategory::NONE) + continue; + // Only process an LR if the map iterator is on the value that appears + // first in the LR. That avoids processing the same LR multiple times. + if (SV != *LR->value_begin()) + continue; + LRs.push_back(LR); + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h new file mode 100644 index 000000000000..00f922561c68 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h @@ -0,0 +1,91 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#ifndef TARGET_GENX_PRESSURE_TRACKER_H +#define TARGET_GENX_PRESSURE_TRACKER_H + +#include + +namespace llvm { + +class Value; +class GenXLiveness; +class FunctionGroup; + +namespace genx { + +class LiveRange; + +class PressureTracker { + FunctionGroup &FG; + GenXLiveness *Liveness; + // Flag to widen byte vectors to word vectors if applicable. + bool WithByteWidening; + // Candidate variable for widening. + std::vector WidenCandidates; + std::vector Pressure; + + static const unsigned THRESHOLD = sizeof(float) * 8 * 120; + struct Segment { + unsigned Begin; + unsigned End; + Segment(unsigned B, unsigned E) : Begin(B), End(E) {} + }; + std::vector HighPressureSegments; + +public: + PressureTracker(FunctionGroup &FG, GenXLiveness *L, + bool WithByteWidening = false) + : FG(FG), Liveness(L), WithByteWidening(WithByteWidening) { + calculate(); + calculateRedSegments(); + } + + // Estimate the register pressure for each Instruction number. + void calculate(); + + // Calculate high pressure segments. + void calculateRedSegments(); + + // Check if segment [B, E] intersects with a high pressure region or not. + bool intersectWithRedRegion(unsigned B, unsigned E) const; + bool intersectWithRedRegion(LiveRange *LR) const; + + // Return the list of variables that are likely to be widened. + const std::vector &getWidenVariables() { return WidenCandidates; } + + // Decrease pressure assuming no widening on variable for LR. + void decreasePressure(LiveRange *LR); + +private: + void getLiveRanges(std::vector &LRs); + void getLiveRangesForValue(Value *V, std::vector &LRs) const; + unsigned getSizeInBytes(LiveRange *LR, bool AllowWidening); +}; + +} // namespace genx +} // namespace llvm + +#endif // TARGET_GENX_PRESSURE_TRACKER_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp new file mode 100644 index 000000000000..c458bae44d0f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp @@ -0,0 +1,243 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXPrinter is a pass that prints the LLVM IR for a function, together +// GenX specific analyses (instruction baling, liveness, register allocation). 
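+// When the corresponding analyses are available, each unbaled instruction is
+// printed with its register assignment, its instruction number and the bale
+// it heads.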
+// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXVisaRegAlloc.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXPrinter : an analysis to print a Function, with GenX specific analyses +class GenXPrinter : public FunctionPass { + raw_ostream &OS; + const std::string Banner; +public: + static char ID; + explicit GenXPrinter(raw_ostream &OS, const std::string &Banner) + : FunctionPass(ID), OS(OS), Banner(Banner) { } + virtual StringRef getPassName() const { return "GenX printer pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + bool runOnFunction(Function &F); +}; + +// GenXGroupPrinter : an analysis to print a FunctionGroup, with GenX specific analyses +class GenXGroupPrinter : public FunctionGroupPass { + raw_ostream &OS; + const std::string Banner; +public: + static char ID; + explicit GenXGroupPrinter(raw_ostream &OS, const std::string &Banner) + : FunctionGroupPass(ID), OS(OS), Banner(Banner) { } + virtual StringRef getPassName() const { return "GenX FunctionGroup printer pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunctionGroup(FunctionGroup &FG); +}; + +} // end namespace llvm + +char GenXPrinter::ID = 0; + +FunctionPass *llvm::createGenXPrinterPass(raw_ostream &O, const std::string &Banner) +{ + return new GenXPrinter(O, Banner); +} + +char GenXGroupPrinter::ID = 0; + +FunctionGroupPass *llvm::createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner) +{ + return new GenXGroupPrinter(O, Banner); +} + +/*********************************************************************** + * printFunction : print function with GenX analyses + */ +static void printFunction(raw_ostream &OS, Function &F, GenXBaling *Baling, + GenXLiveness *Liveness, GenXNumbering *Numbering, GenXVisaRegAlloc *RA) +{ + // This code is a downmarket version of AssemblyWriter::printFunction. + // We have our own version so we can show bales. + OS << "\ndefine "; + cast(cast(F.getType())->getElementType())->getReturnType()->print(OS); + OS << " @" << F.getName() << "("; + for (Function::arg_iterator fb = F.arg_begin(), fi = fb, fe = F.arg_end(); + fi != fe; ) { + if (fi != fb) + OS << ", "; + Argument *Arg = &*fi; + ++fi; + Arg->getType()->print(OS); + OS << " "; + // Only show register number if there is a register allocator. + GenXVisaRegAlloc::Reg* Reg = nullptr; + if (RA) + Reg = RA->getRegForValueOrNull(&F, SimpleValue(Arg)); + if (Reg) { + OS << "["; + Reg->print(OS); + OS << "]"; + } + OS << "%" << Arg->getName(); + } + OS << ") {\n"; + for (Function::iterator fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + if (!BB->use_empty()) + OS << BB->getName() << ":\n"; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!Baling || !Baling->isBaled(Inst)) { + if (RA && !Inst->getType()->isVoidTy()) { + // Show allocated register in brackets. If it is struct type, + // we show the multiple registers. For an alias, show its base + // register in braces as well. 
+ for (unsigned i = 0, + e = IndexFlattener::getNumElements(Inst->getType()); + i != e; ++i) { + auto Reg = RA->getRegForValueOrNull(&F, SimpleValue(Inst, i)); + if (Reg && Reg->Category) { + OS << (!i ? "[" : ","); + Reg->print(OS); + auto BaseReg = RA->getRegForValueUntyped(&F, SimpleValue(Inst, i)); + if (BaseReg != Reg) { + OS << "{"; + assert(BaseReg); + BaseReg->print(OS); + OS << "}"; + } + if (i + 1 == e) + OS << "]"; + } + } + } + // Show instruction number in brackets. + unsigned Num = 0; + if (Numbering) + Num = Numbering->getNumber(Inst); + if (Num) + OS << "[" << Num << "]"; + if (!Baling) { + Inst->print(OS); + OS << "\n"; + } else { + Bale B; + Baling->buildBale(Inst, &B); + if (B.size() == 1) { + Inst->print(OS); + OS << "\n"; + } else { + OS << " bale {\n"; + for (Bale::iterator i = B.begin(), + e = B.end(); i != e; ++i) { + unsigned Num = 0; + if (Numbering) + Num = Numbering->getNumber(i->Inst); + if (Num) + OS << "[" << Num << "]"; + OS << " "; + i->Inst->print(OS); + switch (i->Info.Type) { + case BaleInfo::MAININST: break; + default: OS << " {" << i->Info.getTypeString() << "}"; break; + } + OS << "\n"; + } + if (Num) + OS << "[" << Num << "]"; + OS << " }\n"; + } + } + } + } + } + OS << "}\n"; +} + +/*********************************************************************** + * GenXPrinter::runOnFunction : dump function with GenX analyses + */ +bool GenXPrinter::runOnFunction(Function &F) +{ + GenXVisaRegAlloc *RA = getAnalysisIfAvailable(); + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + if (!RA) { + Liveness = getAnalysisIfAvailable(); + Numbering = getAnalysisIfAvailable(); + } + GenXBaling *Baling = getAnalysisIfAvailable(); + OS << Banner; + printFunction(OS, F, Baling, Liveness, Numbering, RA); + return false; +} + +/*********************************************************************** + * GenXGroupPrinter::runOnFunctionGroup : dump functions with GenX analyses + */ +bool GenXGroupPrinter::runOnFunctionGroup(FunctionGroup &FG) +{ + GenXVisaRegAlloc *RA = getAnalysisIfAvailable(); + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + if (!RA) { + Liveness = getAnalysisIfAvailable(); + Numbering = getAnalysisIfAvailable(); + } + GenXBaling *Baling = getAnalysisIfAvailable(); + if (!Baling) + Baling = getAnalysisIfAvailable(); + OS << Banner; + if (Liveness) + OS << " (see below for GenXLiveness)"; + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) + printFunction(OS, **i, Baling, Liveness, Numbering, RA); + if (Liveness) { + Liveness->print(OS); + OS << "\n"; + } + OS << "\n"; + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp new file mode 100644 index 000000000000..be5427272229 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp @@ -0,0 +1,1081 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies 
or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPromoteArray +/// -------------------- +/// +/// GenXPromoteArray is an optimization pass that converts load/store +/// from an allocated private array into vector loads/stores followed by +/// read-region and write-region. Then we can apply standard llvm optimization +/// to promote the entire array into virtual registers, and remove those +/// loads and stores +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallVector.h" + +#include + +#define MAX_ALLOCA_PROMOTE_GRF_NUM 96 + +using namespace llvm; +using namespace genx; + +namespace { + +// Diagnostic information for error/warning relating array promotion. +class DiagnosticInfoPromoteArray : public DiagnosticInfo { +private: + std::string Description; + +public: + // Initialize from description + DiagnosticInfoPromoteArray(const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(llvm::getNextAvailablePluginDiagnosticKind(), Severity), + Description(Desc.str()) {} + + void print(DiagnosticPrinter &DP) const override { + DP << "GenXPromoteArray: " << Description; + } +}; + +class TransposeHelper { +public: + TransposeHelper(bool vectorIndex, const llvm::DataLayout *DL, + uint64_t baseTypeAllocSize) + : m_vectorIndex(vectorIndex), m_pDL(DL), + m_baseTypeAllocSize(baseTypeAllocSize) {} + void HandleAllocaSources(llvm::Instruction *v, llvm::Value *idx); + void handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *idx); + void handlePHINode(llvm::PHINode *pPhi, llvm::Value *pScalarizedIdx, + llvm::BasicBlock *pIncomingBB); + virtual void handleLoadInst(llvm::LoadInst *pLoad, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleStoreInst(llvm::StoreInst *pStore, + llvm::Value *pScalarizedIdx) = 0; + virtual void handlePrivateGather(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handlePrivateScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleLLVMGather(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleLLVMScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + void EraseDeadCode(); + +private: + bool m_vectorIndex = false; + std::vector m_toBeRemoved; + ValueMap m_phiReplacement; + +protected: + const llvm::DataLayout *m_pDL = nullptr; + uint64_t m_baseTypeAllocSize = 0; +}; + +/// @brief TransformPrivMem pass is used for lowering the allocas identified +/// while visiting the alloca instructions +/// and then inserting insert/extract elements instead of load stores. 
+/// This allows us to store the data in registers instead of propagating +/// it to scratch space. +class TransformPrivMem : public llvm::FunctionPass, + public llvm::InstVisitor { +public: + TransformPrivMem(); + + ~TransformPrivMem() {} + + virtual llvm::StringRef getPassName() const override { + return "TransformPrivMem"; + } + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + virtual bool runOnFunction(llvm::Function &F) override; + + void visitAllocaInst(llvm::AllocaInst &I); + + void visitStore(llvm::StoreInst &St); + + unsigned int extractAllocaSize(llvm::AllocaInst *pAlloca); + +private: + llvm::AllocaInst *createVectorForAlloca(llvm::AllocaInst *pAlloca, + llvm::Type *pBaseType); + void handleAllocaInst(llvm::AllocaInst *pAlloca); + + bool CheckIfAllocaPromotable(llvm::AllocaInst *pAlloca); + + bool replaceSingleAggrStore(llvm::StoreInst *StI); + + bool replaceAggregatedStore(llvm::StoreInst *StI); + +public: + static char ID; + +private: + std::queue m_StoresToHandle; + const llvm::DataLayout *m_pDL = nullptr; + LLVMContext *m_ctx = nullptr; + std::vector m_allocasToPrivMem; + llvm::Function *m_pFunc = nullptr; +}; +} // namespace + +// Register pass to igc-opt +namespace llvm { +void initializeTransformPrivMemPass(PassRegistry &); +} +#define PASS_FLAG "transform-priv-mem" +#define PASS_DESCRIPTION \ + "transform private arrays for promoting them to registers" +#define PASS_CFG_ONLY false +#define PASS_ANALYSIS false +INITIALIZE_PASS_BEGIN(TransformPrivMem, PASS_FLAG, PASS_DESCRIPTION, + PASS_CFG_ONLY, PASS_ANALYSIS) +INITIALIZE_PASS_END(TransformPrivMem, PASS_FLAG, PASS_DESCRIPTION, + PASS_CFG_ONLY, PASS_ANALYSIS) + +char TransformPrivMem::ID = 0; + +FunctionPass *llvm::createTransformPrivMemPass() { + return new TransformPrivMem(); +} + +namespace { + +class TransposeHelperPromote : public TransposeHelper { +public: + void handleLoadInst(LoadInst *pLoad, Value *pScalarizedIdx); + void handleStoreInst(StoreInst *pStore, Value *pScalarizedIdx); + void handlePrivateGather(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handlePrivateScatter(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handleLLVMGather(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handleLLVMScatter(IntrinsicInst *pInst, Value *pScalarizedIdx); + + AllocaInst *pVecAlloca; + + TransposeHelperPromote(AllocaInst *pAI, const llvm::DataLayout *DL, + uint64_t baseTypeAllocSize) + : TransposeHelper(false, DL, baseTypeAllocSize) { + pVecAlloca = pAI; + } +}; + +TransformPrivMem::TransformPrivMem() : FunctionPass(ID), m_pFunc(nullptr) { + initializeTransformPrivMemPass(*PassRegistry::getPassRegistry()); +} + +llvm::AllocaInst * +TransformPrivMem::createVectorForAlloca(llvm::AllocaInst *pAlloca, + llvm::Type *pBaseType) { + IRBuilder<> IRB(pAlloca); + + unsigned int totalSize = extractAllocaSize(pAlloca) / + (unsigned int)(m_pDL->getTypeAllocSize(pBaseType)); + + llvm::VectorType *pVecType = llvm::VectorType::get(pBaseType, totalSize); + + AllocaInst *pAllocaValue = IRB.CreateAlloca(pVecType); + return pAllocaValue; +} + +bool TransformPrivMem::replaceSingleAggrStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + + Value *ValueOp = StI->getValueOperand(); + Value *Ptr = StI->getPointerOperand(); + unsigned AS = StI->getPointerAddressSpace(); + Value *ValToStore = Builder.CreateExtractValue(ValueOp, 0); + ValToStore->setName(ValueOp->getName() + ".noAggr"); + + StoreInst *NewStI = Builder.CreateAlignedStore(ValToStore, + Builder.CreateBitCast(Ptr, 
ValToStore->getType()->getPointerTo(AS)), + StI->getAlignment(), StI->isVolatile()); + m_StoresToHandle.push(NewStI); + StI->eraseFromParent(); + + return true; +} + +bool TransformPrivMem::replaceAggregatedStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + Value *ValueOp = StI->getValueOperand(); + Type *ValueOpTy = ValueOp->getType(); + auto *ST = dyn_cast(ValueOpTy); + auto *AT = dyn_cast(ValueOpTy); + + assert(StI->isSimple()); + assert(AT || ST); + + uint64_t Count = ST ? ST->getNumElements() : AT->getNumElements(); + if (Count == 1) { + return replaceSingleAggrStore(StI); + } + + auto *IdxType = Type::getInt32Ty(*m_ctx); + auto *Zero = ConstantInt::get(IdxType, 0); + for (uint64_t i = 0; i < Count; ++i) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i) + }; + + Value *Ptr = nullptr; + auto *PtrOp = StI->getPointerOperand(); + if (ST) { + Ptr = Builder.CreateInBoundsGEP(ST, + PtrOp, makeArrayRef(Indices)); + } else { + Ptr = Builder.CreateInBoundsGEP(AT, + PtrOp, makeArrayRef(Indices)); + } + Ptr->setName(PtrOp->getName() + ".noAggrGEP"); + auto *Val = Builder.CreateExtractValue(ValueOp, i); + Val->setName(ValueOp->getName() + ".noAggr"); + StoreInst *NewStI = Builder.CreateStore(Val, Ptr, StI->isVolatile()); + + m_StoresToHandle.push(NewStI); + } + + StI->eraseFromParent(); + + return true; +} + +bool TransformPrivMem::runOnFunction(llvm::Function &F) { + m_pFunc = &F; + m_ctx = &(m_pFunc->getContext()); + + m_pDL = &F.getParent()->getDataLayout(); + m_allocasToPrivMem.clear(); + + visit(F); + + bool AggrRemoved = false; + while (!m_StoresToHandle.empty()) { + StoreInst *StI = m_StoresToHandle.front(); + m_StoresToHandle.pop(); + if (StI->getValueOperand()->getType()->isAggregateType()) + AggrRemoved |= replaceAggregatedStore(StI); + } + + std::vector &allocaToHandle = m_allocasToPrivMem; + + for (auto pAlloca : allocaToHandle) { + handleAllocaInst(pAlloca); + } + + // Last remove alloca instructions + for (auto pInst : allocaToHandle) { + if (pInst->use_empty()) { + pInst->eraseFromParent(); + } + } + // IR changed only if we had alloca instruction to optimize or + // if aggregated stores were replaced + return !allocaToHandle.empty() || AggrRemoved; +} + +unsigned int TransformPrivMem::extractAllocaSize(llvm::AllocaInst *pAlloca) { + unsigned int arraySize = + (unsigned int)(cast(pAlloca->getArraySize()) + ->getZExtValue()); + unsigned int totalArrayStructureSize = + (unsigned int)(m_pDL->getTypeAllocSize(pAlloca->getAllocatedType()) * + arraySize); + + return totalArrayStructureSize; +} + +static Type *GetBaseType(Type *pType, Type *pBaseType) { + while (pType->isStructTy() || pType->isArrayTy() || pType->isVectorTy()) { + if (pType->isStructTy()) { + int num_elements = pType->getStructNumElements(); + for (int i = 0; i < num_elements; ++i) { + Type *structElemBaseType = + GetBaseType(pType->getStructElementType(i), pBaseType); + // can support only homogeneous structures + if (pBaseType != nullptr && + (structElemBaseType == nullptr || + structElemBaseType->getTypeID() != pBaseType->getTypeID())) + return nullptr; + pBaseType = structElemBaseType; + } + return pBaseType; + } else if (pType->isArrayTy()) { + pType = pType->getArrayElementType(); + } else if (pType->isVectorTy()) { + pType = pType->getVectorElementType(); + } else { + assert(0); + } + } + if (pType->isPointerTy() && pType->getPointerElementType()->isFunctionTy()) + pType = IntegerType::getInt8Ty(pType->getContext()); + return pType; +} + +static bool CheckAllocaUsesInternal(Instruction *I) { + for 
(Value::user_iterator use_it = I->user_begin(), use_e = I->user_end(); + use_it != use_e; ++use_it) { + if (GetElementPtrInst *gep = dyn_cast(*use_it)) { + auto PtrV = gep->getPointerOperand(); + // we cannot support a vector of pointers as the base of the GEP + if (PtrV->getType()->isPointerTy()) { + if (CheckAllocaUsesInternal(gep)) + continue; + } + return false; + } + if (llvm::LoadInst *pLoad = llvm::dyn_cast(*use_it)) { + if (!pLoad->isSimple()) + return false; + } else if (llvm::StoreInst *pStore = + llvm::dyn_cast(*use_it)) { + if (!pStore->isSimple()) + return false; + llvm::Value *pValueOp = pStore->getValueOperand(); + if (pValueOp == I) { + // GEP instruction is the stored value of the StoreInst (not supported + // case) + return false; + } + } else if (llvm::BitCastInst *pBitCast = + llvm::dyn_cast(*use_it)) { + if (pBitCast->use_empty()) + continue; + Type *baseT = + GetBaseType(pBitCast->getType()->getPointerElementType(), nullptr); + Type *sourceType = GetBaseType( + pBitCast->getOperand(0)->getType()->getPointerElementType(), nullptr); + assert(sourceType); + // either the point-to-element-type is the same or + // the point-to-element-type is the byte or a function pointer + if (baseT != nullptr && + (baseT->getScalarSizeInBits() == 8 || + baseT->getScalarSizeInBits() == sourceType->getScalarSizeInBits() || + (baseT->isPointerTy() && + baseT->getPointerElementType()->isFunctionTy()))) { + if (CheckAllocaUsesInternal(pBitCast)) + continue; + } + // Not a candidate. + return false; + } else if (IntrinsicInst *intr = dyn_cast(*use_it)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(intr); + if (IID == llvm::Intrinsic::lifetime_start || + IID == llvm::Intrinsic::lifetime_end || + IID == GenXIntrinsic::genx_gather_private || + IID == GenXIntrinsic::genx_scatter_private || + IID == llvm::Intrinsic::masked_gather || + IID == llvm::Intrinsic::masked_scatter) { + continue; + } + return false; + } else if (PHINode *phi = dyn_cast(*use_it)) { + // Only GEPs with same base and bitcasts with same src yet supported + Value *pPtrOp = nullptr; + if (auto BC = dyn_cast(I)) + pPtrOp = BC->getOperand(0); + else if (auto GEP = dyn_cast(I)) + pPtrOp = GEP->getPointerOperand(); + else + return false; + + if (all_of(phi->incoming_values(), [&](Value *V) { + if (auto GEP = dyn_cast(V)) + return GEP->getPointerOperand() == pPtrOp; + else if (auto BC = dyn_cast(V)) + return BC->getOperand(0) == pPtrOp; + return false; + })) + if (CheckAllocaUsesInternal(phi)) + continue; + // Not a candidate. + return false; + } else { + // This is some other instruction. Right now we don't want to handle these + return false; + } + } + return true; +} + +bool TransformPrivMem::CheckIfAllocaPromotable(llvm::AllocaInst *pAlloca) { + unsigned int allocaSize = extractAllocaSize(pAlloca); + unsigned int allowedAllocaSizeInBytes = MAX_ALLOCA_PROMOTE_GRF_NUM * 32; + + // if alloca size exceeds alloc size threshold, emit warning + // and discard promotion + if (allocaSize > allowedAllocaSizeInBytes) { + DiagnosticInfoPromoteArray Warn( + m_pFunc->getName() + " allocation size is too big: using TPM", + DS_Warning); + m_pFunc->getContext().diagnose(Warn); + return false; + } + + // Don't even look at non-array or non-struct allocas. 
+ // (extractAllocaDim can not handle them anyway, causing a crash) + llvm::Type *pType = pAlloca->getAllocatedType(); + if ((!pType->isStructTy() && !pType->isArrayTy() && !pType->isVectorTy()) || + pAlloca->isArrayAllocation()) + return false; + + Type *baseType = GetBaseType(pType, nullptr); + if (baseType == nullptr) + return false; + auto Ty = baseType->getScalarType(); + // only handle case with a simple base type + if (!(Ty->isFloatingPointTy() || Ty->isIntegerTy()) && + !(Ty->isPointerTy() && Ty->getPointerElementType()->isFunctionTy())) + return false; + + return CheckAllocaUsesInternal(pAlloca); +} + +void TransformPrivMem::visitStore(StoreInst &I) { + if (I.getValueOperand()->getType()->isAggregateType()) + m_StoresToHandle.push(&I); +} + +void TransformPrivMem::visitAllocaInst(AllocaInst &I) { + // find those allocas that can be promoted as a whole-vector + if (!CheckIfAllocaPromotable(&I)) { + return; + } + m_allocasToPrivMem.push_back(&I); +} + +void TransformPrivMem::handleAllocaInst(llvm::AllocaInst *pAlloca) { + // Extract the Alloca size and the base Type + Type *pType = pAlloca->getType()->getPointerElementType(); + Type *pBaseType = GetBaseType(pType, nullptr); + if (!pBaseType) + return; + pBaseType = pBaseType->getScalarType(); + llvm::AllocaInst *pVecAlloca = createVectorForAlloca(pAlloca, pBaseType); + if (!pVecAlloca) + return; + // skip processing of allocas that are already fine + if (pVecAlloca->getType() == pAlloca->getType()) + return; + + IRBuilder<> IRB(pVecAlloca); + Value *idx = IRB.getInt32(0); + TransposeHelperPromote helper(pVecAlloca, m_pDL, + m_pDL->getTypeAllocSize(pBaseType)); + helper.HandleAllocaSources(pAlloca, idx); + helper.EraseDeadCode(); +} + +void TransposeHelper::EraseDeadCode() { + for (Instruction *I : m_toBeRemoved) + I->dropAllReferences(); + for (Instruction *I : m_toBeRemoved) + I->eraseFromParent(); +} + +void TransposeHelper::HandleAllocaSources(Instruction *v, Value *idx) { + SmallVector instructions; + for (Value::user_iterator it = v->user_begin(), e = v->user_end(); it != e; + ++it) { + Value *inst = cast(*it); + instructions.push_back(inst); + } + + for (auto instruction : instructions) { + if (GetElementPtrInst *pGEP = dyn_cast(instruction)) { + handleGEPInst(pGEP, idx); + } else if (BitCastInst *bitcast = dyn_cast(instruction)) { + m_toBeRemoved.push_back(bitcast); + Type *baseT = + GetBaseType(bitcast->getType()->getPointerElementType(), nullptr); + Type *sourceType = GetBaseType( + bitcast->getOperand(0)->getType()->getPointerElementType(), nullptr); + assert(baseT && sourceType); + // either the point-to-element-type is the same or + // the point-to-element-type is the byte + if (baseT->getScalarSizeInBits() == sourceType->getScalarSizeInBits()) + HandleAllocaSources(bitcast, idx); + else if (baseT->isPointerTy() && baseT->getPointerElementType()->isFunctionTy()) + HandleAllocaSources(bitcast, idx); + else { + assert(baseT->getScalarSizeInBits() == 8); + IRBuilder<> IRB(bitcast); + auto ElementSize = + sourceType->getScalarSizeInBits() / baseT->getScalarSizeInBits(); + Value * Scale = nullptr; + if (idx->getType()->isVectorTy()) { + auto Width = idx->getType()->getVectorNumElements(); + Scale = ConstantVector::getSplat(Width, IRB.getInt32(ElementSize)); + } + else + Scale = IRB.getInt32(ElementSize); + auto NewIdx = IRB.CreateMul(idx, Scale); + HandleAllocaSources(bitcast, NewIdx); + } + } else if (StoreInst *pStore = llvm::dyn_cast(instruction)) { + handleStoreInst(pStore, idx); + } else if (LoadInst *pLoad = 
llvm::dyn_cast(instruction)) { + handleLoadInst(pLoad, idx); + } else if (PHINode *pPhi = llvm::dyn_cast(instruction)) { + handlePHINode(pPhi, idx, v->getParent()); + } else if (IntrinsicInst *inst = dyn_cast(instruction)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(inst); + if (IID == llvm::Intrinsic::lifetime_start || + IID == llvm::Intrinsic::lifetime_end) + inst->eraseFromParent(); + else if (IID == GenXIntrinsic::genx_gather_private) + handlePrivateGather(inst, idx); + else if (IID == GenXIntrinsic::genx_scatter_private) + handlePrivateScatter(inst, idx); + else if (inst->getIntrinsicID() == llvm::Intrinsic::masked_gather) + handleLLVMGather(inst, idx); + else if (inst->getIntrinsicID() == llvm::Intrinsic::masked_scatter) + handleLLVMScatter(inst, idx); + } + } +} + + +void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *GEP, + llvm::Value *idx) { + m_toBeRemoved.push_back(GEP); + Value *PtrOp = GEP->getPointerOperand(); + PointerType *PtrTy = dyn_cast(PtrOp->getType()); + assert(PtrTy && "Only accept scalar pointer!"); + int IdxWidth = 1; + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI) { + Value * Idx = *OI; + if (Idx->getType()->isVectorTy()) { + auto Width = Idx->getType()->getVectorNumElements(); + if (Width > 1) { + if (IdxWidth <= 1) + IdxWidth = Width; + else + assert(IdxWidth == Width && "GEP has inconsistent vector-index width"); + } + } + } + Type *Ty = PtrTy; + gep_type_iterator GTI = gep_type_begin(GEP); + IRBuilder<> IRB(GEP); + Value * pScalarizedIdx = (IdxWidth == 1) ? IRB.getInt32(0) : + ConstantVector::getSplat(IdxWidth, IRB.getInt32(0)); + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI, ++GTI) { + Value *Idx = *OI; + if (StructType *StTy = GTI.getStructTypeOrNull()) { + unsigned Field = unsigned(cast(Idx)->getZExtValue()); + if (Field) { + Constant *OffsetVal = + IRB.getInt32(m_pDL->getStructLayout(StTy)->getElementOffset(Field) / + m_baseTypeAllocSize); + if (IdxWidth > 1) + OffsetVal = ConstantVector::getSplat(IdxWidth, OffsetVal); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, OffsetVal); + } + Ty = StTy->getElementType(Field); + } else { + Ty = GTI.getIndexedType(); + if (const ConstantInt *CI = dyn_cast(Idx)) { + if (!CI->isZero()) { + Constant *OffsetVal = + IRB.getInt32(m_pDL->getTypeAllocSize(Ty) * CI->getZExtValue() / + m_baseTypeAllocSize); + if (IdxWidth > 1) + OffsetVal = ConstantVector::getSplat(IdxWidth, OffsetVal); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, OffsetVal); + } + } + else if (!Idx->getType()->isVectorTy() && IdxWidth <= 1) { + Value *NewIdx = IRB.CreateZExtOrTrunc(Idx, IRB.getInt32Ty()); + auto ElementSize = m_pDL->getTypeAllocSize(Ty) / m_baseTypeAllocSize; + NewIdx = IRB.CreateMul(NewIdx, IRB.getInt32(ElementSize)); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, NewIdx); + } else { + // the input idx is a vector or the one of the GEP index is vector + Value * NewIdx = nullptr; + auto ElementSize = m_pDL->getTypeAllocSize(Ty) / m_baseTypeAllocSize; + if (Idx->getType()->isVectorTy()) { + assert(Idx->getType()->getVectorNumElements() == IdxWidth); + NewIdx = IRB.CreateZExtOrTrunc(Idx, pScalarizedIdx->getType()); + NewIdx = IRB.CreateMul(NewIdx, + ConstantVector::getSplat(IdxWidth, IRB.getInt32(ElementSize))); + } + else { + Value * NewIdx = IRB.CreateZExtOrTrunc(Idx, IRB.getInt32Ty()); + NewIdx = IRB.CreateMul(NewIdx, IRB.getInt32(ElementSize)); + // splat the new-idx into a vector + NewIdx = IRB.CreateVectorSplat(IdxWidth, NewIdx); + } + pScalarizedIdx = 
IRB.CreateAdd(pScalarizedIdx, NewIdx); + } + } + } + if (!idx->getType()->isVectorTy() && IdxWidth <= 1) { + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx); + } + else if (idx->getType()->isVectorTy()) { + assert(idx->getType()->getVectorNumElements() == IdxWidth); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx); + } + else { + auto SplatIdx = IRB.CreateVectorSplat(IdxWidth, idx); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, SplatIdx); + } + HandleAllocaSources(GEP, pScalarizedIdx); +} + +// Pass acummulated idx through new phi +void TransposeHelper::handlePHINode(PHINode *pPhi, Value *idx, + BasicBlock *pIncomingBB) { + PHINode *NewPhi = nullptr; + // If phi is not yet visited + if (!m_phiReplacement.count(pPhi)) { + IRBuilder<> IRB(pPhi); + NewPhi = IRB.CreatePHI(idx->getType(), pPhi->getNumIncomingValues(), "idx"); + m_phiReplacement.insert(std::make_pair(pPhi, NewPhi)); + m_toBeRemoved.push_back(pPhi); + } else + NewPhi = m_phiReplacement[pPhi]; + NewPhi->addIncoming(idx, pIncomingBB); + HandleAllocaSources(pPhi, NewPhi); +} + +void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, + Value *pScalarizedIdx) { + assert(pLoad->isSimple()); + IRBuilder<> IRB(pLoad); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto LdTy = pLoad->getType()->getScalarType(); + auto VETy = pLoadVecAlloca->getType()->getScalarType(); + auto ReadIn = pLoadVecAlloca; + bool IsFuncPointer = pLoad->getPointerOperandType()->isPointerTy() && + pLoad->getPointerOperandType()->getPointerElementType()->isPointerTy() && + pLoad->getPointerOperandType()->getPointerElementType()->getPointerElementType()->isFunctionTy(); + // do the type-casting if necessary + if (VETy != LdTy && !IsFuncPointer) { + auto VLen = pLoadVecAlloca->getType()->getVectorNumElements(); + assert(VETy->getScalarSizeInBits() >= LdTy->getScalarSizeInBits()); + assert((VETy->getScalarSizeInBits() % LdTy->getScalarSizeInBits()) == 0); + VLen = VLen * (VETy->getScalarSizeInBits() / LdTy->getScalarSizeInBits()); + ReadIn = IRB.CreateBitCast(ReadIn, VectorType::get(LdTy, VLen)); + } + if (IsFuncPointer) { + Region R(VectorType::get( + pVecAlloca->getType() + ->getPointerElementType() + ->getVectorElementType(), + m_pDL->getTypeSizeInBits(LdTy) / + m_pDL->getTypeSizeInBits(pVecAlloca->getType() + ->getPointerElementType() + ->getVectorElementType())), + m_pDL); + if (!pScalarizedIdx->getType()->isIntegerTy(16)) { + pScalarizedIdx = IRB.CreateZExtOrTrunc(pScalarizedIdx, Type::getInt16Ty(pLoad->getContext())); + } + R.Indirect = pScalarizedIdx; + auto *Result = R.createRdRegion(pLoadVecAlloca, pLoad->getName(), pLoad, + pLoad->getDebugLoc(), true); + if (!Result->getType()->isPointerTy()) { + auto *BC = + IRB.CreateBitCast(Result, Type::getInt64Ty(pLoad->getContext())); + auto *PtrToI = IRB.CreateIntToPtr(BC, pLoad->getType(), pLoad->getName()); + pLoad->replaceAllUsesWith(PtrToI); + } else + pLoad->replaceAllUsesWith(Result); + } + else if (pLoad->getType()->isVectorTy()) { + // A vector load + // %v = load <2 x float>* %ptr + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = extractelement <32 x float> %w, i32 %idx + // %v1 = extractelement <32 x float> %w, i32 %idx+1 + // replace all uses of %v with <%v0, %v1> + auto Len = pLoad->getType()->getVectorNumElements(); + Value *Result = UndefValue::get(pLoad->getType()); + for (unsigned i = 0; i < Len; ++i) { + Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i); + auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx); + auto Val = 
IRB.CreateExtractElement(ReadIn, Idx); + Result = IRB.CreateInsertElement(Result, Val, VectorIdx); + } + pLoad->replaceAllUsesWith(Result); + } else { + auto Result = IRB.CreateExtractElement(ReadIn, pScalarizedIdx); + pLoad->replaceAllUsesWith(Result); + } + pLoad->eraseFromParent(); +} + +void TransposeHelperPromote::handleStoreInst(llvm::StoreInst *pStore, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + assert(pStore->isSimple()); + IRBuilder<> IRB(pStore); + llvm::Value *pStoreVal = pStore->getValueOperand(); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + llvm::Value *WriteOut = pLoadVecAlloca; + auto StTy = pStoreVal->getType()->getScalarType(); + auto VETy = pLoadVecAlloca->getType()->getScalarType(); + // do the type-casting if necessary + + bool IsFuncPointerStore = + (isFuncPointerVec(pStoreVal) || + (pStoreVal->getType()->isPointerTy() && + pStoreVal->getType()->getPointerElementType()->isFunctionTy())); + if (VETy != StTy && !IsFuncPointerStore) { + auto VLen = pLoadVecAlloca->getType()->getVectorNumElements(); + assert(VETy->getScalarSizeInBits() >= StTy->getScalarSizeInBits()); + assert((VETy->getScalarSizeInBits()%StTy->getScalarSizeInBits()) == 0); + VLen = VLen * (VETy->getScalarSizeInBits() / StTy->getScalarSizeInBits()); + WriteOut = IRB.CreateBitCast(WriteOut, VectorType::get(StTy, VLen)); + } + if (IsFuncPointerStore) { + auto *NewStoreVal = pStoreVal; + assert(pVecAlloca->getType()->getPointerElementType()->getVectorElementType()->isIntegerTy(8)); + if (NewStoreVal->getType()->isPointerTy() && + NewStoreVal->getType()->getPointerElementType()->isFunctionTy()) { + NewStoreVal = IRB.CreatePtrToInt(NewStoreVal, IntegerType::get(pStore->getContext(), 64)); + NewStoreVal = IRB.CreateBitCast(NewStoreVal, VectorType::get(VETy, 8)); + } + Region R(NewStoreVal, m_pDL); + if (!pScalarizedIdx->getType()->isIntegerTy(16)) { + pScalarizedIdx = IRB.CreateZExtOrTrunc(pScalarizedIdx, Type::getInt16Ty(pStore->getContext())); + } + R.Indirect = pScalarizedIdx; + WriteOut = R.createWrRegion(WriteOut, NewStoreVal, pStore->getName(), pStore, + pStore->getDebugLoc()); + } else if (pStoreVal->getType()->isVectorTy()) { + // A vector store + // store <2 x float> %v, <2 x float>* %ptr + // becomes + // %w = load <32 x float> *%ptr1 + // %v0 = extractelement <2 x float> %v, i32 0 + // %w0 = insertelement <32 x float> %w, float %v0, i32 %idx + // %v1 = extractelement <2 x float> %v, i32 1 + // %w1 = insertelement <32 x float> %w0, float %v1, i32 %idx+1 + // store <32 x float> %w1, <32 x float>* %ptr1 + auto Len = pStoreVal->getType()->getVectorNumElements(); + for (unsigned i = 0; i < Len; ++i) { + Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i); + auto Val = IRB.CreateExtractElement(pStoreVal, VectorIdx); + auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx); + WriteOut = IRB.CreateInsertElement(WriteOut, Val, Idx); + } + } else { + WriteOut = IRB.CreateInsertElement(WriteOut, pStoreVal, pScalarizedIdx); + } + // cast the vector type back if necessary + if (VETy != StTy) + WriteOut = IRB.CreateBitCast(WriteOut, pLoadVecAlloca->getType()); + IRB.CreateStore(WriteOut, pVecAlloca); + pStore->eraseFromParent(); +} + +void TransposeHelperPromote::handlePrivateGather(IntrinsicInst *pInst, + Value *pScalarizedIdx) { + IRBuilder<> IRB(pInst); + assert(pInst->getType()->isVectorTy()); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto N = pInst->getType()->getVectorNumElements(); + auto ElemType = 
pInst->getType()->getVectorElementType(); + + // A vector load + // %v = <2 x float> gather %pred, %ptr, %offset, %old_value + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = <2 x float> rdregion <32 x float> %w, i32 %offsets, %stride + // + // replace all uses of %v with <%v0, %v1> + Region R(pInst); + int64_t v0 = 0; + int64_t diff = 0; + ConstantInt *CI = dyn_cast(pScalarizedIdx); + PointerType *GatherPtrTy = + dyn_cast(pInst->getArgOperand(1)->getType()); + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in gather + assert(GatherPtrTy); + unsigned GatherPtrNumBytes = + GatherPtrTy->getElementType()->getPrimitiveSizeInBits() / 8; + if (CI != nullptr && + IsLinearVectorConstantInts(pInst->getArgOperand(2), v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + int BytesOffset = CI->getSExtValue() * GatherPtrNumBytes; + R.Offset = v0 + BytesOffset; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pInst->getArgOperand(2), OffsetType, true); + auto Cast = IRB.CreateIntCast( + pScalarizedIdx, IntegerType::getInt16Ty(pInst->getContext()), true); + auto Scale = IRB.CreateMul(IRB.getInt16(GatherPtrNumBytes), Cast); + auto vec = VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), 1); + auto GEPOffsets = + IRB.CreateInsertElement(UndefValue::get(vec), Scale, IRB.getInt32(0)); + GEPOffsets = IRB.CreateShuffleVector( + GEPOffsets, UndefValue::get(vec), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateAdd(GEPOffsets, Offsets); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + Value *Result = + R.createRdRegion(pLoadVecAlloca, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + + // if old-value is not undefined and predicate is not all-one, + // create a select auto OldVal = pInst->getArgOperand(3); + auto PredVal = pInst->getArgOperand(0); + bool PredAllOne = false; + if (auto C = dyn_cast(PredVal)) { + if (auto B = C->getSplatValue()) + PredAllOne = B->isOneValue(); + } + auto OldVal = pInst->getArgOperand(3); + if (!PredAllOne && !isa(OldVal)) { + Result = IRB.CreateSelect(PredVal, Result, OldVal); + } + + pInst->replaceAllUsesWith(Result); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handlePrivateScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + IRBuilder<> IRB(pInst); + llvm::Value *pStoreVal = pInst->getArgOperand(3); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + if (pStoreVal->getType()->isVectorTy() == false) { + assert(false); + return; + } + auto N = pStoreVal->getType()->getVectorNumElements(); + auto ElemType = pStoreVal->getType()->getVectorElementType(); + // A vector scatter + // scatter %pred, %ptr, %offset, %newvalue + // becomes + // %w = load <32 x float> *%ptr1 + // %w1 = <32 x float> wrregion %w, newvalue, %offset, %pred + // store <32 x float> %w1, <32 x float>* %ptr1 + + // Create the new wrregion + Region R(pStoreVal); + int64_t v0 = 0; + int64_t diff = 0; + ConstantInt *CI = dyn_cast(pScalarizedIdx); + PointerType* ScatterPtrTy = + dyn_cast(pInst->getArgOperand(1)->getType()); + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in scatter + 
assert(ScatterPtrTy); + unsigned ScatterPtrNumBytes = + ScatterPtrTy->getElementType()->getPrimitiveSizeInBits() / 8; + if (CI != nullptr && IsLinearVectorConstantInts(pInst->getArgOperand(2), v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + int BytesOffset = CI->getSExtValue() * ScatterPtrNumBytes; + R.Offset = v0 + BytesOffset; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pInst->getArgOperand(2), OffsetType, true); + auto Cast = IRB.CreateIntCast( + pScalarizedIdx, IntegerType::getInt16Ty(pInst->getContext()), true); + auto Scale = IRB.CreateMul(IRB.getInt16(ScatterPtrNumBytes), Cast); + auto vec = VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), 1); + auto GEPOffsets = + IRB.CreateInsertElement(UndefValue::get(vec), Scale, IRB.getInt32(0)); + GEPOffsets = IRB.CreateShuffleVector( + GEPOffsets, UndefValue::get(vec), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateAdd(GEPOffsets, Offsets); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + R.Mask = pInst->getArgOperand(0); + auto NewInst = cast( + R.createWrRegion(pLoadVecAlloca, pStoreVal, pInst->getName(), + pInst /*InsertBefore*/, pInst->getDebugLoc())); + + IRB.CreateStore(NewInst, pVecAlloca); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handleLLVMGather(IntrinsicInst *pInst, + Value *pScalarizedIdx) { + IRBuilder<> IRB(pInst); + assert(pInst->getType()->isVectorTy()); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto N = pInst->getType()->getVectorNumElements(); + auto ElemType = pInst->getType()->getVectorElementType(); + + // A vector load + // %v = <2 x float> gather %pred, %vector_of_ptr, %old_value + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = <2 x float> rdregion <32 x float> %w, i32 %offsets, %stride + // + // replace all uses of %v with <%v0, %v1> + Region R(pInst); + int64_t v0 = 0; + int64_t diff = 0; + // count byte offset depending on the type of pointer in gather + unsigned ElemNumBytes = ElemType->getPrimitiveSizeInBits() / 8; + if (IsLinearVectorConstantInts(pScalarizedIdx, v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + R.Offset = v0; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } + else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pScalarizedIdx, OffsetType, false); + auto ScaleVec = + IRB.CreateInsertElement(UndefValue::get(OffsetType), IRB.getInt16(ElemNumBytes), IRB.getInt32(0)); + ScaleVec = IRB.CreateShuffleVector( + ScaleVec, UndefValue::get(OffsetType), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateMul(Offsets, ScaleVec); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + Value *Result = + R.createRdRegion(pLoadVecAlloca, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + + // if old-value is not undefined and predicate is not all-one, + // create a select auto OldVal = pInst->getArgOperand(3); + auto PredVal = pInst->getArgOperand(2); + bool PredAllOne = false; + if (auto C = dyn_cast(PredVal)) { + if (auto B = C->getSplatValue()) + PredAllOne = B->isOneValue(); + } + auto OldVal = pInst->getArgOperand(3); + if (!PredAllOne 
&& !isa(OldVal)) { + Result = IRB.CreateSelect(PredVal, Result, OldVal); + } + + pInst->replaceAllUsesWith(Result); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handleLLVMScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + IRBuilder<> IRB(pInst); + llvm::Value *pStoreVal = pInst->getArgOperand(3); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + if (pStoreVal->getType()->isVectorTy() == false) { + assert(false); + return; + } + auto N = pStoreVal->getType()->getVectorNumElements(); + auto ElemType = pStoreVal->getType()->getVectorElementType(); + // A vector scatter + // scatter %pred, %ptr, %offset, %newvalue + // becomes + // %w = load <32 x float> *%ptr1 + // %w1 = <32 x float> wrregion %w, newvalue, %offset, %pred + // store <32 x float> %w1, <32 x float>* %ptr1 + + // Create the new wrregion + Region R(pStoreVal); + int64_t v0 = 0; + int64_t diff = 0; + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in scatter + unsigned ElemNumBytes = ElemType->getPrimitiveSizeInBits() / 8; + if (IsLinearVectorConstantInts(pScalarizedIdx, v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + R.Offset = v0; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } + else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pScalarizedIdx, OffsetType, false); + auto ScaleVec = IRB.CreateInsertElement(UndefValue::get(OffsetType), + IRB.getInt16(ElemNumBytes), + IRB.getInt32(0)); + ScaleVec = IRB.CreateShuffleVector( + ScaleVec, UndefValue::get(OffsetType), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateMul(Offsets, ScaleVec); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + R.Mask = pInst->getArgOperand(0); + auto NewInst = cast( + R.createWrRegion(pLoadVecAlloca, pStoreVal, pInst->getName(), + pInst /*InsertBefore*/, pInst->getDebugLoc())); + + IRB.CreateStore(NewInst, pVecAlloca); + pInst->eraseFromParent(); +} + +} // namespace diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp new file mode 100644 index 000000000000..c28809807f66 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp @@ -0,0 +1,204 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXPromotePredicate
+/// --------------------
+///
+/// GenXPromotePredicate is an optimization pass that promotes vector operations
+/// on predicates (n x i1) to operations on wider integer types (n x i16).
+/// This often reduces flag register pressure and improves code quality.
+///
+//===----------------------------------------------------------------------===//
+
+#include "GenX.h"
+#include "GenXModule.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace genx;
+
+static cl::opt<unsigned>
+LogicOpsThreshold("logical-ops-threshold", cl::init(2), cl::Hidden,
+                  cl::desc("Number of logical operations"));
+
+namespace {
+
+class GenXPromotePredicate : public FunctionPass {
+public:
+  static char ID;
+  GenXPromotePredicate() : FunctionPass(ID) {}
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override { return "GenXPromotePredicate"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<GenXModule>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  bool matchOpnds(llvm::BasicBlock *UseBB, Value *V, unsigned &NumLogicOps);
+  Value *rewriteTree(Instruction *Inst);
+};
+
+} // namespace
+
+char GenXPromotePredicate::ID = 0;
+
+namespace llvm {
+void initializeGenXPromotePredicatePass(PassRegistry &);
+}
+INITIALIZE_PASS_BEGIN(GenXPromotePredicate, "GenXPromotePredicate",
+                      "GenXPromotePredicate", false, false)
+INITIALIZE_PASS_END(GenXPromotePredicate, "GenXPromotePredicate",
+                    "GenXPromotePredicate", false, false)
+
+FunctionPass *llvm::createGenXPromotePredicatePass() {
+  initializeGenXPromotePredicatePass(*PassRegistry::getPassRegistry());
+  return new GenXPromotePredicate;
+}
+
+// This matches a common pattern like
+//
+//   v1.merge(v2, (v3 > 0) | (v4 < 9))
+//
+// The Or operation will be performed on n x i1 predicates, which may cause
+// flag spills when n is large. We promote such computations into n x i16.
+//
+bool GenXPromotePredicate::runOnFunction(Function &F) {
+  // Collect candidates.
+  SmallVector<Instruction *, 8> Candidates;
+  for (auto &BB : F.getBasicBlockList()) {
+    for (auto &Inst : BB.getInstList()) {
+      auto SI = dyn_cast<SelectInst>(&Inst);
+      if (SI == nullptr || SI->use_empty())
+        continue;
+
+      // Match conditions with at least 32 elements.
+      auto Cond = dyn_cast<Instruction>(SI->getCondition());
+      if (!Cond || !Cond->getType()->isVectorTy())
+        continue;
+      if (Cond->getType()->getVectorNumElements() < 32)
+        continue;
+
+      // TODO: analyze when it is beneficial to promote.
+      unsigned NumLogicOps = 0;
+      if (matchOpnds(SI->getParent(), Cond, NumLogicOps) &&
+          NumLogicOps >= LogicOpsThreshold)
+        Candidates.push_back(Cond);
+    }
+  }
+
+  // Do promotions. This is a tree rewrite, with candidates as roots and
+  // comparisons or constants as leaf nodes.
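+  //
+  // As a minimal illustration (the operand names and the <32 x ...> widths
+  // below are assumed for the example, not taken from a real kernel), a
+  // candidate condition such as
+  //
+  //   %c1 = icmp sgt <32 x i32> %v3, zeroinitializer
+  //   %c2 = icmp slt <32 x i32> %v4, %nine
+  //   %c3 = icmp ne  <32 x i32> %v5, zeroinitializer
+  //   %t  = or  <32 x i1> %c1, %c2
+  //   %c  = and <32 x i1> %t, %c3
+  //   %m  = select <32 x i1> %c, <32 x float> %v2, <32 x float> %v1
+  //
+  // is rewritten so the logical operations happen on i16 elements:
+  //
+  //   %c1.sext = sext <32 x i1> %c1 to <32 x i16>
+  //   %c2.sext = sext <32 x i1> %c2 to <32 x i16>
+  //   %c3.sext = sext <32 x i1> %c3 to <32 x i16>
+  //   %t16     = or  <32 x i16> %c1.sext, %c2.sext
+  //   %c16     = and <32 x i16> %t16, %c3.sext
+  //   %c       = trunc <32 x i16> %c16 to <32 x i1>
+  //   %m  = select <32 x i1> %c, <32 x float> %v2, <32 x float> %v1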
+ for (auto Inst : Candidates) { + assert(Inst->hasOneUse()); + Instruction *UI = Inst->user_back(); + Value *V = rewriteTree(Inst); + assert(isa(V)); + auto TI = TruncInst::Create(CastInst::Trunc, V, Inst->getType()); + TI->insertAfter(cast(V)); + TI->setDebugLoc(Inst->getDebugLoc()); + UI->replaceUsesOfWith(Inst, TI); + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } + + return !Candidates.empty(); +} + +bool GenXPromotePredicate::matchOpnds(llvm::BasicBlock *UseBB, Value *V, + unsigned &NumLogicOps) { + auto Inst = dyn_cast(V); + // Constants are OK. + if (Inst == nullptr) + return isa(V); + + unsigned Opc = Inst->getOpcode(); + switch (Opc) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + ++NumLogicOps; + // Match local definitions only. + if (!Inst->hasOneUse() || Inst->getParent() != UseBB) + return false; + + // Recurse on its operands. + return matchOpnds(UseBB, Inst->getOperand(0), NumLogicOps) && + matchOpnds(UseBB, Inst->getOperand(1), NumLogicOps); + case Instruction::ICmp: + case Instruction::FCmp: + // Matching stops at local comparison operands. + return Inst->hasOneUse() && Inst->getParent() == UseBB; + default: + break; + } + + // Not a match. + return false; +} +Value *GenXPromotePredicate::rewriteTree(Instruction *Inst) { + IRBuilder<> Builder(Inst); + unsigned N = Inst->getType()->getVectorNumElements(); + VectorType *VT = VectorType::get(Builder.getInt16Ty(), N); + unsigned Opc = Inst->getOpcode(); + switch (Opc) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Value *Ops[] = {nullptr, nullptr}; + for (unsigned i : {0, 1}) { + Value *Op = Inst->getOperand(i); + if (auto C = dyn_cast(Op)) + Ops[i] = Builder.CreateSExt(C, VT, ".sext"); + else if (auto I = dyn_cast(Op)) + Ops[i] = rewriteTree(I); + else + llvm_unreachable("out of sync"); + } + + Value *V = Builder.CreateBinOp(Instruction::BinaryOps(Opc), Ops[0], Ops[1]); + V->takeName(Inst); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + return V; + } + case Instruction::ICmp: + case Instruction::FCmp: { + auto V = Builder.CreateSExt(Inst, VT, ".sext"); + if (auto I = dyn_cast(V)) { + I->setDebugLoc(Inst->getDebugLoc()); + Inst->moveBefore(I); + } + return V; + } + default: + break; + } + + llvm_unreachable("out of sync"); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp new file mode 100644 index 000000000000..48b1172c4eb4 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp @@ -0,0 +1,96 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXRawSendRipper
+/// -----------------
+///
+/// This pass tears down chains of raw sends that are linked through the
+/// old-value operand, when it is safe to do so.
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "GENX_RAWSENDRIPPER"
+#include "GenX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace genx;
+
+namespace {
+
+class GenXRawSendRipper : public FunctionPass {
+
+public:
+  static char ID;
+  explicit GenXRawSendRipper() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "GenX RAW send ripper";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // End anonymous namespace
+
+namespace llvm {
+void initializeGenXRawSendRipperPass(PassRegistry &);
+} // End namespace llvm
+
+char GenXRawSendRipper::ID = 0;
+INITIALIZE_PASS(GenXRawSendRipper, "GenXRawSendRipper",
+                "Rip chain of raw send", false, false)
+
+FunctionPass *llvm::createGenXRawSendRipperPass() {
+  initializeGenXRawSendRipperPass(*PassRegistry::getPassRegistry());
+  return new GenXRawSendRipper();
+}
+
+bool GenXRawSendRipper::runOnFunction(Function &F) {
+  bool Changed = false;
+  Value *True = ConstantInt::getTrue(F.getContext());
+  for (auto &BB : F)
+    for (auto &I : BB) {
+      if (GenXIntrinsic::getGenXIntrinsicID(&I) != GenXIntrinsic::genx_raw_send)
+        continue;
+      auto II = cast<CallInst>(&I);
+      if (II->getOperand(1) != True)
+        continue;
+      Value *Old = II->getOperand(5);
+      if (isa<UndefValue>(Old))
+        continue;
+      II->setOperand(5, UndefValue::get(Old->getType()));
+      Changed = true;
+    }
+  return Changed;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp
new file mode 100644
index 000000000000..2d56603ff330
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp
@@ -0,0 +1,1038 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXReduceIntSize +/// ----------------- +/// +/// GenXReduceIntSize is a function pass that reduces the size of vector int +/// values where it can. +/// +/// The semantics of the source language usually involve an operator such as + +/// promoting its operands before performing the calculation. Typically, the +/// front end compiler generates IR for the promotion without bothering to work +/// out if it is unnecessary, as it is easier to work out if it is unnecessary +/// in a later LLVM pass. +/// +/// For scalar operations, LLVM already contains passes to do this. But it does +/// not seem to for vectors, possibly because OpenCL does not have C-like +/// promotion rules for vectors. CM does have C-like promotion rules for vectors, +/// so we need to cope with unnecessarily promoted operations. +/// +/// Operation of the pass +/// ^^^^^^^^^^^^^^^^^^^^^ +/// +/// First it does a backwards scan, spotting where an instruction can be +/// converted to a smaller int size because its result is used in other +/// instructions that only use the lower part of the value (trunc, or an "and" +/// with e.g. 0xff). The modified instruction with a smaller int size then +/// needs a trunc inserting for each operand. When the pass reaches the +/// instruction that is the input to that new trunc, it may be able to +/// modify that one too. Thus a reduced int size gets propagated backwards. +/// +/// Then it does a forwards scan, spotting where an instruction can be converted +/// to a smaller int size because the operands have only the lower part of the +/// value set (zext/sext, or an "and" with e.g. 0xff). The modified instruction with +/// a smaller int size then needs a ZExt/SExt inserting. Thus the reduced int size +/// is propagated forwards. 
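+///
+/// As a minimal worked example (the types below are illustrative, not taken
+/// from a real kernel), given
+///
+///   %a32 = zext <8 x i8> %a to <8 x i32>
+///   %b32 = zext <8 x i8> %b to <8 x i32>
+///   %s32 = add <8 x i32> %a32, %b32
+///   %s   = trunc <8 x i32> %s32 to <8 x i16>
+///
+/// the add is only used through a trunc to 16 bits, so the pass can perform
+/// it on <8 x i16> operands and drop the i32 promotion entirely.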
+/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_REDUCEINTSIZE" + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXUtil.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + + +using namespace llvm; +using namespace llvm::PatternMatch; +using namespace genx; + +namespace { + +// GenXReduceIntSize : reduce integer size +class GenXReduceIntSize : public FunctionPass { + struct ValueNumBits { + unsigned NumBits; + bool IsSignExtended; + ValueNumBits(unsigned NumBits) : NumBits(NumBits), IsSignExtended(false) {} + ValueNumBits(unsigned NumBits, bool IsSignExtended) + : NumBits(NumBits), IsSignExtended(IsSignExtended) {} + }; + bool Modified; +public: + static char ID; + explicit GenXReduceIntSize() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX reduce integer size"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + Instruction *reverseProcessInst(Instruction *Inst); + Value *truncValue(Value *V, unsigned NumBits, Instruction *InsertBefore, + const DebugLoc &DL); + Instruction *forwardProcessInst(Instruction *Inst); + ValueNumBits getValueNumBits(Value *V, bool PreferSigned = false); + Value *getSplatValue(ShuffleVectorInst *SVI) const; +}; + +} // end anonymous namespace + +char GenXReduceIntSize::ID = 0; +namespace llvm { void initializeGenXReduceIntSizePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXReduceIntSize, "GenXReduceIntSize", "GenXReduceIntSize", false, false) +INITIALIZE_PASS_END(GenXReduceIntSize, "GenXReduceIntSize", "GenXReduceIntSize", false, false) + +class ExtOperator : public Operator { +public: + static bool isExtOpcode(unsigned Opc) { + return Opc == Instruction::SExt || Opc == Instruction::ZExt; + } + static inline bool classof(const Instruction *I) { + return isExtOpcode(I->getOpcode()); + } + static inline bool classof(const ConstantExpr *CE) { + return isExtOpcode(CE->getOpcode()); + } + static inline bool classof(const Value *V) { + return (isa(V) && classof(cast(V))) || + (isa(V) && classof(cast(V))); + } +}; + +FunctionPass *llvm::createGenXReduceIntSizePass() +{ + initializeGenXReduceIntSizePass(*PassRegistry::getPassRegistry()); + return new GenXReduceIntSize(); +} + +void GenXReduceIntSize::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + +/*********************************************************************** + * GenXReduceIntSize::runOnFunction : process one function to + * reduce integer size where possible + */ +bool GenXReduceIntSize::runOnFunction(Function &F) +{ + // Reverse scan: This does a postordered depth first traversal of the CFG, + // processing instructions within a basic block in reverse, to ensure that we + // see a def after its uses (ignoring phi node uses). 
+ Modified = false; + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + // This loop scans the BB in reverse, and allows processReverseInst to + // erase Inst and other instructions. + for (auto Inst = &BB->back(); Inst; ) + Inst = reverseProcessInst(Inst); + } + // Forward scan: This does a preordered depth first traversal of the CFG to + // ensure that we see a def before its uses (ignoring phi node uses). + for (df_iterator i = df_begin(&F.getEntryBlock()), + e = df_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + // This loop scans the BB forward, and allows processForwardInst to erase + // Inst and other instructions. + for (auto Inst = &BB->front(); Inst; ) + Inst = forwardProcessInst(Inst); + } + return Modified; +} + +/*********************************************************************** + * getAndNumBits : get the number of lower bits set by an "and" instruction + */ +static unsigned getAndNumBits(Instruction *Inst) +{ + if (auto C = dyn_cast(Inst->getOperand(1))) { + if ((C = C->getSplatValue())) { + uint64_t Val = cast(C)->getZExtValue(); + return 64 - countLeadingZeros(Val, ZB_Width); + } + } + return Inst->getType()->getScalarType()->getPrimitiveSizeInBits(); +} + +/*********************************************************************** + * getPrev : get the previous instruction, or 0 if at start of BB + * getNext : get the next instruction, or 0 if at end of BB + */ +static Instruction *getPrev(Instruction *Inst) +{ + if (&Inst->getParent()->front() == Inst) + return nullptr; + return Inst->getPrevNode(); +} + +static Instruction *getNext(Instruction *Inst) +{ + if (&Inst->getParent()->back() == Inst) + return nullptr; + return Inst->getNextNode(); +} + +/*********************************************************************** + * reverseProcessInst : process one instruction in GenXReduceIntSize's + * reverse scan + * + * Enter: Inst = the instruction to process + * + * Return: the previous instruction (after any erases done in here), 0 if + * at start of block + */ +Instruction *GenXReduceIntSize::reverseProcessInst(Instruction *Inst) +{ + Instruction *Prev = getPrev(Inst); + // Ignore if not at least a 4 vector. + auto VT = dyn_cast(Inst->getType()); + if (!VT) + return Prev; + if (!VT->getElementType()->isIntegerTy()) + return Prev; + unsigned NumBits = VT->getElementType()->getPrimitiveSizeInBits(); + if (NumBits == 1) + return Prev; + unsigned TruncBits = 0; + // See if the value is only used in instructions that use fewer bits (trunc, + // and, shl). Get the max truncated size. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + unsigned ThisTruncBits = NumBits; + auto user = cast(ui->getUser()); + switch (user->getOpcode()) { + case Instruction::Trunc: + ThisTruncBits = user->getType()->getScalarType()->getPrimitiveSizeInBits(); + break; + case Instruction::And: + ThisTruncBits = getAndNumBits(user); + break; + default: + ThisTruncBits = NumBits; + break; + } + TruncBits = std::max(TruncBits, ThisTruncBits); + if (TruncBits == NumBits) + break; + } + if (!TruncBits) + return Prev; // Inst is unused + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + // If the instruction is not min/max, truncate to no smaller than 16. 
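+  // (Illustration of the rounding above, assuming genx::log2 is a
+  // floor-of-log2 helper: TruncBits = 5 rounds up to 8 and TruncBits = 12
+  // rounds up to 16, while exact powers of two are left unchanged; a
+  // non-min/max value whose uses only need 12 bits is therefore recomputed
+  // at 16 bits.)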
+ switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_umin: + case GenXIntrinsic::genx_smax: + case GenXIntrinsic::genx_umax: + break; + default: + TruncBits = std::max(TruncBits, 16U); + break; + } + if (TruncBits >= NumBits) + return Prev; // Inst is used somewhere that cannot truncate. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: can truncate to " + << TruncBits << " bits: " << *Inst << "\n"); + Value *NewVal = nullptr; + Instruction *NewInst = nullptr; + // Put new code _after_ original instruction, so we don't see it again in + // this backwards pass. + Instruction *InsertBefore = Inst->getNextNode(); + const DebugLoc &DL = Inst->getDebugLoc(); + switch (Inst->getOpcode()) { + case Instruction::LShr: + case Instruction::AShr: + // An shr by constant needs N more bits, where N is the constant. + // That might still allow some truncation. + if (auto C = dyn_cast(Inst->getOperand(1))) { + if ((C = C->getSplatValue())) { + TruncBits += cast(C)->getSExtValue(); + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: actually can only truncate right shift to " + << TruncBits << " bits\n"); + if (TruncBits < NumBits) { + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL), + truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL), + "", InsertBefore); + break; + } + } + } + // Other shr cannot truncate. + return Prev; + case Instruction::And: + // An "and" by constant might be completely removable if the rhs truncates + // to all ones. + if (auto C = dyn_cast(Inst->getOperand(1))) { + if (cast(truncValue(C, TruncBits, InsertBefore, DL)) + ->isAllOnesValue()) { + // Remove the "and". + NewVal = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + break; + } + } + // Otherwise, fall through to treat "and" like the other truncatable + // binary ops. + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + // These binary operators can just truncate. + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL), + truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL), + "", InsertBefore); + break; + case Instruction::ZExt: + case Instruction::SExt: { + NewVal = Inst->getOperand(0); + unsigned NewBits = NewVal->getType()->getScalarType() + ->getPrimitiveSizeInBits(); + if (TruncBits != NewBits) { + // The value still needs extending, just not as much as before. Or it + // might need to be truncated. + unsigned NumElements = cast(Inst->getType()) + ->getNumElements(); + int Opcode = Instruction::Trunc; + if (TruncBits > NewBits) + Opcode = Inst->getOpcode(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), TruncBits); + auto Ty = VectorType::get(ElTy, NumElements); + NewInst = CastInst::Create((Instruction::CastOps)Opcode, NewVal, + Ty, "", InsertBefore); + } + } + break; + case Instruction::ShuffleVector: + if (!cast(Inst->getOperand(2))->isNullValue()) + return Prev; + if (cast(Inst->getOperand(0)->getType()) + ->getNumElements() == 1) { + // This shufflevector is a splat from a 1-vector. 
+ auto TruncatedInput = truncValue(Inst->getOperand(0), TruncBits, + InsertBefore, DL); + NewInst = new ShuffleVectorInst(TruncatedInput, + UndefValue::get(TruncatedInput->getType()), Inst->getOperand(2), "", + InsertBefore); + break; + } + // Detect when the shufflevector is the second half of an + // insertelement+shufflevector sequence being used to implement + // a splat (and the insertelement has no other use). For example: + // %splat.splatinsert.i = insertelement <16 x i32> undef, i32 %direction, i32 0, !dbg !355 + // %splat.splat.i = shufflevector <16 x i32> %splat.splatinsert.i, <16 x i32> undef, <16 x i32> zeroinitializer, !dbg !355 + if (auto IE = dyn_cast(Inst->getOperand(0))) { + if (IE->hasOneUse()) { + if (auto C = dyn_cast(IE->getOperand(2))) { + if (C->isNullValue()) { + // This is a splat, and we can truncate it by creating new + // insertelement and shufflevector instructions. + unsigned NumElements = cast(Inst->getType()) + ->getNumElements(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), + TruncBits); + auto Ty = VectorType::get(ElTy, NumElements); + auto NewScalar = CastInst::Create(Instruction::Trunc, + IE->getOperand(1), ElTy, + IE->getOperand(1)->getName() + ".reduceintsize", InsertBefore); + NewScalar->setDebugLoc(IE->getDebugLoc()); + auto NewIE = InsertElementInst::Create(UndefValue::get(Ty), + NewScalar, IE->getOperand(2), "", InsertBefore); + NewIE->setDebugLoc(IE->getDebugLoc()); + NewIE->takeName(IE); + NewInst = new ShuffleVectorInst(NewIE, UndefValue::get(Ty), + Inst->getOperand(2), "", InsertBefore); + break; + } + } + } + } + return Prev; + default: + return Prev; + } + if (NewInst) { + NewInst->setDebugLoc(DL); + NewInst->takeName(Inst); + NewVal = NewInst; + } + assert(NewVal); + // NewVal is the replacement for Inst with a smaller int size. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewVal: " << *NewVal << "\n"); + // Replace the uses of Inst, which we know are all things that + // have a reduced size requirement (trunc, and). + while (!Inst->use_empty()) { + Instruction *user = cast(Inst->use_begin()->getUser()); + unsigned ThisTruncBits = + user->getType()->getScalarType()->getPrimitiveSizeInBits(); + switch (user->getOpcode()) { + case Instruction::Trunc: { + auto ThisNewVal = NewVal; + if (ThisTruncBits != TruncBits) { + // We need a new trunc. + auto NewTI = CastInst::Create(Instruction::Trunc, NewVal, user->getType(), + "", user); + NewTI->takeName(user); + NewTI->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewTI: " << *NewTI << "\n"); + ThisNewVal = NewTI; + } + user->replaceAllUsesWith(ThisNewVal); + user->eraseFromParent(); + } + break; + case Instruction::And: { + auto ThisNewVal = NewVal; + unsigned AndBits = getAndNumBits(user); + if (AndBits != TruncBits) { + // We need a replacement "and" instruction with a different type. + auto NewAnd = BinaryOperator::Create(Instruction::And, NewVal, + truncValue(user->getOperand(1), TruncBits, + user, user->getDebugLoc()), + "", user); + NewAnd->takeName(user); + NewAnd->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewAnd: " << *NewAnd << "\n"); + ThisNewVal = NewAnd; + } + if (ThisTruncBits != TruncBits) { + // Need to trunc or extend our new instruction's result to match + // the result of the "and". + assert(ThisNewVal); + auto NewCast = CastInst::Create( + ThisTruncBits > TruncBits ? 
Instruction::ZExt : Instruction::Trunc, + ThisNewVal, user->getType(), "", user); + if (NewVal == ThisNewVal) + NewCast->takeName(user); + else + NewCast->setName(ThisNewVal->getName() + ".cast"); + NewCast->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewCast: " << *NewCast << "\n"); + ThisNewVal = NewCast; + } + user->replaceAllUsesWith(ThisNewVal); + user->eraseFromParent(); + } + break; + default: + assert(0 && "unexpected use"); + break; + } + } + // Erase Inst. Its operands may now become unused, in which case remove + // those too. + auto Opnd0Inst = dyn_cast(Inst->getOperand(0)); + Instruction *Opnd1Inst = nullptr; + if (Inst->getNumOperands() >= 2) + Opnd1Inst = dyn_cast(Inst->getOperand(1)); + Inst->eraseFromParent(); + if (Opnd0Inst && Opnd0Inst->use_empty()) { + if (Opnd0Inst == Prev) + Prev = getPrev(Prev); + Opnd0Inst->eraseFromParent(); + if (Opnd0Inst == Opnd1Inst) + Opnd1Inst = nullptr; + } + if (Opnd1Inst && Opnd1Inst->use_empty()) { + if (Opnd1Inst == Prev) + Prev = getPrev(Prev); + Opnd1Inst->eraseFromParent(); + } + Modified = true; + return Prev; +} + +/*********************************************************************** + * truncValue : get truncated version of value + * + * Enter: V = value to truncate (might be constant) + * NumBits = integer bit size to truncate to + * InsertBefore = insert any new instruction before here + * DL = debug loc for any new instruction + */ +Value *GenXReduceIntSize::truncValue(Value *V, unsigned NumBits, + Instruction *InsertBefore, const DebugLoc &DL) +{ + unsigned NumElements = cast(V->getType())->getNumElements(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), NumBits); + auto Ty = VectorType::get(ElTy, NumElements); + if (Ty == V->getType()) + return V; + if (auto C = dyn_cast(V)) { + if (isa(C)) + return UndefValue::get(Ty); + if (auto SV = C->getSplatValue()) { + auto AI = cast(SV)->getValue(); + AI = AI.trunc(NumBits); + C = Constant::getIntegerValue(Ty, AI); + return C; + } + SmallVector Vals; + if (auto CV = dyn_cast(C)) { + for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) + Vals.push_back(CV->getOperand(i)); + return ConstantVector::get(Vals); + } else if (auto CDV = dyn_cast(C)) { + for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) + Vals.push_back(Constant::getIntegerValue(ElTy, + APInt(NumBits, CDV->getElementAsInteger(i)))); + return ConstantVector::get(Vals); + } + } + // Not a constant. + if (auto Inst = dyn_cast(V)) { + switch (Inst->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: { + // The value is the result of a truncate or extend. + // See if the input is already the right size. + Value *Input = Inst->getOperand(0); + if (Input->getType() == Ty) + return Input; + // Instead of truncating the value, truncate or extend the input. + auto NewInst = CastInst::Create( + Input->getType()->getScalarType()->getPrimitiveSizeInBits() + < NumBits ? 
(Instruction::CastOps)Inst->getOpcode() + : Instruction::Trunc, + Input, Ty, Inst->getName() + ".reduceintsize", InsertBefore); + NewInst->setDebugLoc(DL); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::truncVal: " << *NewInst << "\n"); + return NewInst; + } + case Instruction::And: + if (auto C = dyn_cast(Inst->getOperand(1))) { + auto VNB = getValueNumBits(C); + if (!VNB.IsSignExtended && VNB.NumBits >= NumBits) { + C = C->getSplatValue(); + if (C) { + APInt Mask = C->getUniqueInteger(); + if (Mask.isMask(NumBits)) + // The value is the result of an "and" that only keeps bits + // within the truncated size. Just use its input. + return + truncValue(Inst->getOperand(0), NumBits, InsertBefore, DL); + } + } + } + break; + default: + break; + } + } + // Create a new trunc instruction. + auto NewInst = CastInst::Create(Instruction::Trunc, V, Ty, + V->getName() + ".reduceintsize", InsertBefore); + NewInst->setDebugLoc(DL); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::truncVal: " << *NewInst << "\n"); + return NewInst; +} + +/*********************************************************************** + * forwardProcessInst : process one instruction in GenXReduceIntSize's + * forward scan + * + * Enter: Inst = the instruction to process + * + * Return: the next instruction (after any erases done in here), 0 if + * at end of block + */ +Instruction *GenXReduceIntSize::forwardProcessInst(Instruction *Inst) { + Instruction *Next = getNext(Inst); + // Ignore if not at least a 4 vector. + auto VT = dyn_cast(Inst->getType()); + if (!VT) { + Type *Ty = Inst->getType(); + Value *A; + const APInt *Val; + // Transform (add zext(A), Val) to (zext (add zext(A), Val)). + if (Ty->isIntegerTy(32) && + match(Inst, m_Add(m_ZExt(m_Value(A)), m_APInt(Val)))) + if (A->getType()->isIntegerTy(8) && Val->isNonNegative() && Val->isIntN(8)) { + IRBuilder<> Builder(Inst); + IntegerType *I16Ty = Builder.getInt16Ty(); + APInt NVal = Val->trunc(16); + Instruction *NewInst = cast( + Builder.CreateZExt( + Builder.CreateAdd(Builder.CreateZExt(A, I16Ty), + ConstantInt::get(I16Ty, NVal)), Ty)); + NewInst->takeName(Inst); + Inst->replaceAllUsesWith(NewInst); + Inst->eraseFromParent(); + Modified = true; + } + return Next; + } + if (!VT->getElementType()->isIntegerTy()) + return Next; + unsigned NumBits = VT->getElementType()->getPrimitiveSizeInBits(); + if (NumBits == 1) + return Next; + unsigned TruncBits = NumBits; + bool NeedSignExtend = false; + Instruction *InsertBefore = Inst; + Instruction *NewInst = nullptr; + Value *NewVal = nullptr; + const DebugLoc &DL = Inst->getDebugLoc(); + switch (Inst->getOpcode()) { + case Instruction::LShr: + case Instruction::AShr: + // Convert shl+shr pair back into trunc+ext here, because it makes it + // easier to handle an op that uses the result of it. + if (auto NewInst = convertShlShr(Inst)) { + auto Shl = cast(Inst->getOperand(0)); + Inst->eraseFromParent(); + if (Shl->use_empty()) + Shl->eraseFromParent(); + Inst = NewInst; + } + break; + default: + break; + } + auto IID = GenXIntrinsic::not_any_intrinsic; + switch (Inst->getOpcode()) { + case Instruction::ShuffleVector: + if (Value *V = getSplatValue(cast(Inst))) { + // Transform "splat (ext v)" to "ext (splat v)". 
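+      // For example (illustrative types, not from a real kernel):
+      //   %e = zext i8 %x to i32
+      //   %s = splat %e over <16 x i32>    ; insertelement + shufflevector
+      // becomes
+      //   %v = splat %x over <16 x i8>
+      //   %s = zext <16 x i8> %v to <16 x i32>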
+ if (auto Ext = dyn_cast(V)) { + unsigned NumElts = Inst->getType()->getVectorNumElements(); + IntegerType *I32Ty = Type::getInt32Ty(Inst->getContext()); + VectorType *MaskTy = VectorType::get(I32Ty, NumElts); + Value *Mask = Constant::getNullValue(MaskTy); + Value *Src = Ext->getOperand(0); + if (!isa(Src->getType())) { + VectorType *VTy = VectorType::get(Src->getType(), NumElts); + Src = + InsertElementInst::Create(UndefValue::get(VTy), Src, + Constant::getNullValue(I32Ty), "", + InsertBefore); + } + NewInst = + new ShuffleVectorInst(Src, UndefValue::get(Src->getType()), + Mask, "", InsertBefore); + if (Ext->getOpcode() == Instruction::ZExt) + NewInst = new ZExtInst(NewInst, Inst->getType(), "", InsertBefore); + else + NewInst = new SExtInst(NewInst, Inst->getType(), "", InsertBefore); + } + } + break; + case Instruction::LShr: { + // LShr can just truncate as long as it does not need sign extending. + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + if (!VNB0.IsSignExtended) + TruncBits = VNB0.NumBits; + } + goto binop; + case Instruction::AShr: { + // AShr can just truncate as long as it does need sign extending. + auto VNB0 = getValueNumBits(Inst->getOperand(0), + /*PreferSigned=*/true); + if (VNB0.IsSignExtended) { + TruncBits = VNB0.NumBits; + NeedSignExtend = true; + } + } + goto binop; + case Instruction::And: + { + Value *A; + const APInt *Val; + if (match(Inst, m_And(m_Value(A), m_APInt(Val))) && + Val->isMask(Val->getActiveBits())) { + TruncBits = std::max(16, 1 << genx::log2(Val->getActiveBits() * 2 - 1)); + NeedSignExtend = false; + goto binop; + } + // "And" can just truncate, if both operands are truncated, and need the + // same kind of extension. + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + NeedSignExtend = VNB0.IsSignExtended; + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + if (TruncBits < NumBits) { + auto Opnd1 = truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL); + if (auto C = dyn_cast(Opnd1)) { + if (C->isAllOnesValue()) { + // An "and" with constant that is now all ones can be omitted. + // This bypasses the usual rule that an "and", like most other + // operators, should not be truncated smaller than 16. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: can truncate to " << TruncBits + << " bits and remove completely: " << *Inst << "\n"); + NewVal = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + break; + } + } + } + } + } + goto binop; + case Instruction::Or: + case Instruction::Xor: + // These binary operators can just truncate, if both operands are + // truncated, and need the same kind of extension. + { + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + NeedSignExtend = VNB0.IsSignExtended; + } + } + goto binop; + case Instruction::Sub: { + Value *A; + const APInt *Val; + // Transforms (sub (zext A), (zext B)) to (zext (sub A, B)) if A is proved + // to be greater than B. 
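+    // Worked example (exposition only): for
+    //   %r = sub <N x i32> <255, 255, ...>, (zext <N x i8> %a to <N x i32>)
+    // every lane of %a is at most 255, so the subtraction can never go
+    // negative; it can be done as an i16 operation (TruncBits = 16) and the
+    // result zero-extended back to i32.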
+ if (match(Inst, m_Sub(m_APInt(Val), m_ZExt(m_Value(A))))) { + unsigned ASize = A->getType()->getScalarSizeInBits(); + if (ASize <= 16 && Val->trunc(ASize).isMaxValue()) { + TruncBits = 16; + goto binop; + } + } + break; + } + case Instruction::Call: + IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::genx_umin: + case GenXIntrinsic::genx_umax: + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_smax: { + // umin/umax/smin/smax can just truncate as long as both operands + // have the same type of extension. The type of extension (zero + // or signed) determines whether the truncated op is umin/umax or + // smin/smax: + // + // a = zext i16 1 to i32 = 0x00000001 + // b = zext i16 -1 to i32 = 0x0000FFFF + // umax(a, b) = b = umax(trunc(a), trunc(b)) + // smax(a, b) = b = umax(trunc(a), trunc(b)) + // + // c = sext i16 1 to i32 = 0x00000001 + // d = sext i16 -1 to i32 = 0xFFFFFFFF + // umax(c, d) = d = smax(trunc(c), trunc(d)) + // smax(c, d) = c = smax(trunc(c), trunc(d)) + // + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + // Round TruncBits up to next power of two no smaller than 8. + // For min and max, allow byte operands. + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + + Type *SrcTy = Inst->getOperand(0)->getType(); + unsigned SrcBits = SrcTy->getScalarSizeInBits(); + // Only update IID when there is truncation in the source. + if (TruncBits < SrcBits) { + switch (IID) { + case GenXIntrinsic::genx_smax: + case GenXIntrinsic::genx_umax: + IID = VNB0.IsSignExtended ? GenXIntrinsic::genx_smax + : GenXIntrinsic::genx_umax; + break; + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_umin: + IID = VNB0.IsSignExtended ? GenXIntrinsic::genx_smin + : GenXIntrinsic::genx_umin; + break; + default: + break; + } + } + } + } + goto binop_truncate; + default: + break; + } + break; + + binop: + // Round TruncBits up to next power of two no smaller than 16. + // Truncating to 8 bits often makes worse gen code because of the + // restrictions on byte operands in gen. + TruncBits = std::max(16, 1 << genx::log2(TruncBits * 2 - 1)); + binop_truncate: + if (TruncBits < NumBits) { + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: can truncate to " << TruncBits + << " bits: " << *Inst << "\n"); + auto Opnd0 = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + auto Opnd1 = truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL); + if (isa(Inst)) { + // Create the replacement instruction: binary operator. + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + Opnd0, Opnd1, "", InsertBefore); + } else { + // Create the replacement instruction: intrinsic. + // If it is not the case that all uses trunc to TruncBits, then + // use the original size as the result type. + Type *ResTy = Opnd0->getType(); + bool IsOneEltVecTy = false; + if (auto VTy = dyn_cast(ResTy)) + IsOneEltVecTy = VTy->getNumElements() == 1; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + // Trace through 'extractelement' on single-element vector values. 
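+        // For example (exposition only), with a <1 x i32> intrinsic result:
+        //   %v = <1 x i32> min/max intrinsic result
+        //   %s = extractelement <1 x i32> %v, i32 0
+        //   %t = trunc i32 %s to i16
+        // the use that matters for the result-type decision is the trunc, so
+        // we step through the single-use extractelement to reach it.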
+ if (IsOneEltVecTy && + User->getOpcode() == Instruction::ExtractElement && + User->hasOneUse()) + User = User->user_back(); + switch (User->getOpcode()) { + case Instruction::Trunc: + if (User->getType()->getScalarType() + ->getPrimitiveSizeInBits() == TruncBits) { + // Use is trunc to TruncBits: allow truncated result type + // for intrinsic. + continue; + } + break; + case Instruction::And: + if (auto C = dyn_cast(User->getOperand(1))) { + auto VNB = getValueNumBits(C); + if (!VNB.IsSignExtended && VNB.NumBits <= TruncBits) { + // Use is and with no bits remaining outside bottom + // TruncBits: allow truncated result type for intrinsic. + continue; + } + } + break; + } + // Other cases: use the original size as the result type. + ResTy = Inst->getType(); + } + TruncBits = ResTy->getScalarType()->getPrimitiveSizeInBits(); + Type *Tys[] = { ResTy, Opnd0->getType() }; + Function *Decl = GenXIntrinsic::getGenXDeclaration( + Inst->getParent()->getParent()->getParent(), + IID, Tys); + Value *Args[] = { Opnd0, Opnd1 }; + NewInst = CallInst::Create(Decl, Args, "", InsertBefore); + } + } + break; + default: + break; + } + if (NewInst) { + NewInst->takeName(Inst); + NewInst->setDebugLoc(DL); + NewVal = NewInst; + } + if (!NewVal) + return Next; + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: NewVal: " << *NewVal << "\n"); + // Replace uses of Inst. The default is that we zero/sign extend back to the + // original size. However, if the use is in a trunc or zext/sext, then we can + // combine. + Instruction *Extended = nullptr; + while (!Inst->use_empty()) { + auto user = cast(Inst->use_begin()->getUser()); + auto ThisNewVal = NewVal; + switch (user->getOpcode()) { + case Instruction::ZExt: + if (NeedSignExtend) + break; + goto combine; + case Instruction::SExt: + if (!NeedSignExtend) + break; + goto combine; + case Instruction::Trunc: + combine: { + unsigned TargetNumBits = user->getType()->getScalarType() + ->getPrimitiveSizeInBits(); + if (TargetNumBits != TruncBits) { + auto NewCast = CastInst::Create( + TargetNumBits > TruncBits + ? (NeedSignExtend ? Instruction::SExt : Instruction::ZExt) + : Instruction::Trunc, + NewVal, user->getType(), "", user); + NewCast->takeName(user); + NewCast->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: NewCast: " + << *NewCast << "\n"); + ThisNewVal = NewCast; + } + user->replaceAllUsesWith(ThisNewVal); + if (user == Next) + Next = getNext(Next); + user->eraseFromParent(); + } + continue; + } + // Default case. + if (!Extended && NewVal->getType() == Inst->getType()) + Extended = NewInst; + if (!Extended) { + Extended = CastInst::Create( + NeedSignExtend ? Instruction::SExt : Instruction::ZExt, NewVal, + Inst->getType(), NewVal->getName() + ".reduceintsize_extend", + Inst->getNextNode()); + Extended->setDebugLoc(Inst->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: Extended: " + << *Extended << "\n"); + } + *Inst->use_begin() = Extended; + } + // Erase Inst. Its operands may now become unused, in which case remove + // those too. 
+ auto Opnd0Inst = dyn_cast(Inst->getOperand(0)); + Instruction *Opnd1Inst = nullptr; + if (Inst->getNumOperands() >= 2) + Opnd1Inst = dyn_cast(Inst->getOperand(1)); + Inst->eraseFromParent(); + if (Opnd0Inst && Opnd0Inst->use_empty()) { + if (Opnd0Inst == Next) + Next = getPrev(Next); + Opnd0Inst->eraseFromParent(); + } + if (Opnd1Inst && Opnd1Inst->use_empty()) { + if (Opnd1Inst == Next) + Next = getPrev(Next); + Opnd1Inst->eraseFromParent(); + } + Modified = true; + return Next; +} + +/*********************************************************************** + * getValueNumBits : get the number of bits needed for the vector int value + * + * Enter: PreferSigned = return ValueNumBits with IsSignExtended true + * (and NumBits one greater) for a non-negative + * constant + * + * This just returns the number of bits in an element of the value, except + * for these special cases: + * + * 1. A splatted constant returns the number of bits required to represent + * an element of the constant. + * + * 2. A ZExt returns the number of bits in an element of the _input_. + * + * 3. A SExt returns the number of bits in an element of the _input_, with the + * flag to say it needs sign extending. + * + * 4. An "and" with splatted constant returns the number of bits required + * to represent an element of that constant. + * + * This function returns a ValueNumBits, which contains: + * - NumBits, number of bits required + * - IsSignExtended, true if the missing bits are derived by sign extending + * rather than zero extending + */ +GenXReduceIntSize::ValueNumBits GenXReduceIntSize::getValueNumBits( + Value *V, bool PreferSigned) +{ + unsigned NumBits = V->getType()->getScalarType()->getPrimitiveSizeInBits(); + if (auto C = dyn_cast(V)) { + if (C->getType()->isVectorTy()) + C = C->getSplatValue(); + if (C) { + int64_t Val = cast(C)->getSExtValue(); + if (Val >= 0) + return ValueNumBits(64 - countLeadingZeros((uint64_t)Val, ZB_Width) + + PreferSigned, /*IsSignExtended=*/PreferSigned); + return ValueNumBits(63 - countLeadingZeros((uint64_t)-Val, ZB_Undefined), + /*IsSignExtended=*/true); + } + return NumBits; + } + auto Inst = dyn_cast(V); + if (!Inst) + return NumBits; + switch (Inst->getOpcode()) { + case Instruction::ZExt: + return static_cast(Inst->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits()); + case Instruction::SExt: + return ValueNumBits(Inst->getOperand(0)->getType()->getScalarType() + ->getPrimitiveSizeInBits(), /*IsSignExtended=*/true); + case Instruction::And: + if (auto C = dyn_cast(Inst->getOperand(1))) { + ValueNumBits VNB = getValueNumBits(C); + if (!VNB.IsSignExtended) + return VNB; + } + break; + } + return NumBits; +} + +Value *GenXReduceIntSize::getSplatValue(ShuffleVectorInst *SVI) const { + if (!SVI->getMask()->isNullValue()) + return nullptr; + + Value *Src = SVI->getOperand(0); + + if (auto IEI = dyn_cast(Src)) { + auto C = dyn_cast(IEI->getOperand(2)); + if (C && C->isNullValue()) + return IEI->getOperand(1); + } + + if (cast(Src->getType())->getNumElements() == 1) + return Src; + + return nullptr; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp new file mode 100644 index 000000000000..b79788e0945c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp @@ -0,0 +1,954 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy 
of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Implementation of methods for Region class +// +//===----------------------------------------------------------------------===// + +#include "GenXRegion.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/GenXAnalysis.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * getWithOffset : get a Region given a rdregion/wrregion, baling in + * constant add of offset + * + * This constructs the Region with a variable index that is a constant add + * baled in (i.e. Region::Indirect and Region::Offset both set to the + * operands of the add). It is for use when baling information is not + * available, but the caller wants the constant offset separated out like + * that. + */ +Region Region::getWithOffset(Instruction *Inst, bool WantParentWidth) +{ + unsigned OperandNum = 0; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + OperandNum = GenXIntrinsic::GenXRegion::RdIndexOperandNum; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + OperandNum = GenXIntrinsic::GenXRegion::WrIndexOperandNum; + break; + default: + llvm_unreachable("not rdregion or wrregion"); + break; + } + BaleInfo BI; + if (GenXBaling::isBalableIndexAdd(Inst->getOperand(OperandNum))) + BI.setOperandBaled(OperandNum); + return Region(Inst, BI, WantParentWidth); +} + +/*********************************************************************** + * Region constructor from a rd/wr region and its BaleInfo + * This also works with rdpredregion and wrpredregion, with Offset in + * bits rather than bytes, and with ElementBytes set to 1. + */ +Region::Region(Instruction *Inst, const BaleInfo &BI, bool WantParentWidth) + : CMRegion() +{ + // Determine where to get the subregion value from and which arg index + // the region parameters start at. 
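+  // For orientation (exposition only): a rdregion with VStride=8, Width=4,
+  // Stride=1, Offset=0 on a <16 x i32> input reads elements
+  // {0,1,2,3, 8,9,10,11}, i.e. two rows of four contiguous elements whose
+  // row starts are 8 elements apart.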
+ unsigned ArgIdx = 0; + Value *Subregion = 0; + assert(isa(Inst)); + auto CallI = cast(Inst); + assert(CallI->getCalledFunction()); + switch (GenXIntrinsic::getGenXIntrinsicID(CallI->getCalledFunction())) { + case GenXIntrinsic::genx_rdpredregion: + NumElements = Inst->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(1))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_wrpredregion: + NumElements = Inst->getOperand(1)->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(2))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + ArgIdx = 1; + // The size/type of the region is given by the return value: + Subregion = Inst; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + ArgIdx = 2; + // The size/type of the region is given by the "subregion value to + // write" operand: + Subregion = Inst->getOperand(1); + // For wrregion, while we're here, also get the mask. We set mask to NULL + // if the mask operand is constant 1 (i.e. not predicated). + Mask = Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(Mask)) + if (C->isAllOnesValue()) + Mask = 0; + break; + default: + assert(0); + } + // Get the region parameters. + assert(Subregion); + ElementTy = Subregion->getType(); + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + const DataLayout &DL = Inst->getModule()->getDataLayout(); + assert(DL.getTypeSizeInBits(ElementTy) % genx::ByteBits == 0); + ElementBytes = DL.getTypeSizeInBits(ElementTy) / genx::ByteBits; + VStride = cast(Inst->getOperand(ArgIdx))->getSExtValue(); + Width = cast(Inst->getOperand(ArgIdx + 1))->getSExtValue(); + Stride = cast(Inst->getOperand(ArgIdx + 2))->getSExtValue(); + ArgIdx += 3; + // Get the start index. + Value *V = Inst->getOperand(ArgIdx); + assert(V->getType()->getScalarType()->isIntegerTy(16) && + "region index must be i16 or vXi16 type"); +#if _DEBUG + if (VectorType *VT = dyn_cast(V->getType())) + assert(VT->getNumElements() * Width == NumElements && + "vector region index size mismatch"); +#endif + if (ConstantInt *CI = dyn_cast(V)) + Offset = CI->getSExtValue(); // Constant index. + else { + Indirect = V; // Index is variable; assume no baled in add. + if (BI.isOperandBaled(ArgIdx)) { + Instruction *Operator = cast(V); + // The index is variable and has something baled in. We want to process + // a baled in add or add_addr, and ignore a baled in rdregion. + if(!GenXIntrinsic::isRdRegion(Operator)) { + // The index is variable and has a baled in or/add/sub/add_addr. 
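+        // For example (exposition only): an index of the form
+        //   %idx.add = add i16 %idx, 64
+        // is split here so that Indirect = %idx and Offset = 64, letting the
+        // constant part be treated as an ordinary region offset.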
+ assert((Operator->getOpcode() == Instruction::Add || + Operator->getOpcode() == Instruction::Sub || + Operator->getOpcode() == Instruction::Or || + GenXIntrinsic::getGenXIntrinsicID(Operator) == GenXIntrinsic::genx_add_addr) + && "error: your offset seems to be calculated not through 'add' or 'or' "); + Constant *C = cast(Operator->getOperand(1)); + ConstantInt *CI = dyn_cast(C); + if (!CI) + CI = cast(C->getSplatValue()); + + // check for or could be changed to add + if (Operator->getOpcode() == Instruction::Or && + !haveNoCommonBitsSet(Operator->getOperand(0), Operator->getOperand(1), + Operator->getModule()->getDataLayout())) + { + assert(false && "or should be changed to add without any errors"); + } + + + Offset = CI->getSExtValue(); + + if (Operator->getOpcode() == Instruction::Sub) + Offset = -Offset; + + Indirect = Operator->getOperand(0); + } + } + // For a variable index, get the parent width arg. + ConstantInt *PW = dyn_cast(Inst->getOperand(ArgIdx + 1)); + if (PW) + ParentWidth = PW->getZExtValue(); + } + // We do some trivial legalization here. The legalization pass does not + // make these changes; instead we do them here so they are not permanently + // written back into the IR but are made on the fly each time some other + // pass uses this code to get the region info. + if (NumElements == 1) { + Width = Stride = 1; + VStride = 0; + } else { + if (NumElements <= Width) { + Width = NumElements; + VStride = 0; + } else if ((unsigned)VStride == Width * Stride) { + // VStride == Width * Stride, so we can canonicalize to a 1D region, + // but only if not indirect or not asked to preserve parentwidth, + // and never if multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + VStride = 0; + ParentWidth = 0; + } + } else if (Width == 1) { + // We can turn a 2D width 1 region into a 1D region, but if it is + // indirect it invalidates ParentWidth. So only do it if not asked + // to keep ParentWidth. Also we cannot do it if it is multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + Stride = VStride; + VStride = 0; + ParentWidth = 0; + } + } + if (Stride == 0 && Width == NumElements) { + // Canonical scalar region. 
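+      // For example (exposition only): a region with Stride 0 whose single
+      // row covers all 8 elements (a broadcast of one element) is rewritten
+      // as the canonical scalar form Width=1, VStride=0, which later passes
+      // recognize as a splat.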
+ Width = 1; + VStride = 0; + } + } +} + +/*********************************************************************** + * Region::getLegalSize : get the max legal size of a region + * + * Enter: Idx = start index into the subregion + * Allow2D = whether to allow 2D region + * InputNumElements = number of elements in whole input vector (so + * we can tell if it is small enough that it cannot possibly + * cross a GRF boundary) + * ST = GenXSubtarget (so we can get gen specific crossing rules) + * AI = 0 else AlignmentInfo (to determine alignment of indirect index) + */ +unsigned Region::getLegalSize(unsigned Idx, bool Allow2D, + unsigned InputNumElements, const GenXSubtarget *ST, AlignmentInfo *AI) +{ + Alignment Align; + if (Indirect) { + Align = Alignment::getUnknown(); + if (AI) + Align = AI->get(Indirect); + } + return getLegalSize(Idx, Allow2D, InputNumElements, ST, Align); +} + +/*********************************************************************** + * Region::getLegalSize : get the max legal size of a region + * + * Enter: Idx = start index into the subregion + * Allow2D = whether to allow 2D region + * InputNumElements = number of elements in whole input vector (so + * we can tell if it is small enough that it cannot possibly + * cross a GRF boundary) + * ST = GenXSubtarget (so we can get gen specific crossing rules) + * Align = alignment of indirect index if any + * + * The setting of Indirect is used as follows: + * + * 0: not indirect + * anything of scalar type: single indirect + * anything of vector type: multi indirect + */ +unsigned Region::getLegalSize(unsigned Idx, bool Allow2D, + unsigned InputNumElements, const GenXSubtarget *ST, Alignment Align) +{ + // Determine the max valid width. + unsigned ValidWidth = 1; + unsigned GRFWidth = ST ? ST->getGRFWidth() : 32; + int MaxStride = 4; + unsigned LogGRFWidth = genx::log2(GRFWidth); + if ((!Stride || exactLog2(Stride) >= 0) && (Allow2D || Stride <= MaxStride)) { + // The stride is legal, so we can potentially do more than one element at a + // time. + // Disallow 2D if the stride is too large for a real Gen region. For a + // source operand (Allow2D is true), we allow a 1D region with stride too + // large, because the vISA writer turns it into a 2D region with width 1. + bool StrideValid = (Stride <= MaxStride); + + if (Indirect && isa(Indirect->getType())) { + // Multi indirect. + if (!Allow2D) { + // Multi indirect not allowed in wrregion. + if (!Stride) + ValidWidth = 1 << genx::log2(Width); + } else if (Width == 1 || !Stride) { + // Multi indirect with width 1 or stride 0. + // Return the max power of two number of elements that: + // 1. fit in 2 GRFs; and + // 2. fit in the whole region; and + // 3. fit in a row if the width is not legal + // 4. no more than 8 elements in multi indirect (because there + // are only 8 elements in an address register). + unsigned LogWidth = genx::log2(Width); + if (1U << LogWidth == Width) + LogWidth = genx::log2(NumElements); // legal width + unsigned LogElementBytes = genx::log2(ElementBytes); + if (LogWidth + LogElementBytes > (LogGRFWidth + 1)) + LogWidth = LogGRFWidth + 1 - LogElementBytes; + ValidWidth = 1 << LogWidth; + if (ValidWidth > 8) + ValidWidth = 8; + } + // Other multi indirect can only do one element. + } else { + // Calculate number of elements up to the boundary imposed by GRF + // crossing rules. 
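+      // Worked example (exposition only): with a 32-byte GRF and 4-byte
+      // (i32) elements, ElementsPerGRF is 8. A direct region whose first
+      // element starts 2 elements into a GRF can therefore extend at most
+      // 2*8 - 2 = 14 elements before reaching the next-but-one GRF boundary.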
+ unsigned ElementsPerGRF = GRFWidth / ElementBytes; + unsigned OffsetElements = Offset / ElementBytes; + unsigned ElementsToBoundary = 1; + unsigned RealIdx = Idx / Width * VStride + Idx % Width * Stride; + if (!Indirect) { + // For a direct operand, just use the constant offset of the + // region and the index so far to calculate how far into a GRF this + // subregion starts, and set the boundary at the next-but-one GRF + // boundary. + unsigned NumGRF = 2; + ElementsToBoundary = (NumGRF * ElementsPerGRF) - + ((RealIdx + OffsetElements) % ElementsPerGRF); + } else if (InputNumElements <= ElementsPerGRF) { + // Indirect region but the whole vector is no bigger than a GRF, so + // there is no limit imposed by GRF crossing. + ElementsToBoundary = ElementsPerGRF; + } else { + // For an indirect region, calculate the min and max index (including + // offset) from the region parameters, and add on the current start + // index to both. + // For <= BDW: + // 1. If the min and max then are in the same GRF, then the distance + // from max to the next GRF boundary is the allowed size. + // For >= SKL: + // 1. If the min and max then are in the same GRF, then the distance + // from max to the next-but-one GRF boundary is the allowed size. + // 2. If the max is in the next GRF after min, then the distance + // from max to the next GRF boundary is the allowed size. + // However vISA imposes the restriction that, in a source indirect + // region, a row cannot cross a GRF, unless the region is contiguous. + // Pending a proper fix, we have a temporary fix here that we disallow + // GRF crossing completely unless the original region is a destination + // operand or is a 1D source operand (so GenXVisaFuncWriter can turn it + // into Nx1 instead of 1xN). We use Allow2D as a proxy for "is source + // operand". + unsigned GRFsPerIndirect = 1; + assert(ST); + if (ST->hasIndirectGRFCrossing() && + // SKL+. See if we can allow GRF crossing. + (Allow2D || !is2D())) { + GRFsPerIndirect = 2; + } + unsigned Last = (NumElements / Width - 1) * VStride + (Width - 1) * Stride; + unsigned Max = InputNumElements - Last - 1 + RealIdx; + unsigned Min = RealIdx; + unsigned MinMaxGRFDiff = (Max & -ElementsPerGRF) - (Min & -ElementsPerGRF); + if (!MinMaxGRFDiff) // min and max in same GRF + ElementsToBoundary = ElementsPerGRF * GRFsPerIndirect + - (Max & (ElementsPerGRF - 1)); + else if (MinMaxGRFDiff == 1 && GRFsPerIndirect > 1) + ElementsToBoundary = ElementsPerGRF - (Max & (ElementsPerGRF - 1)); + // We may be able to refine an indirect region legal width further... + if (exactLog2(ParentWidth) >= 0 + && ParentWidth <= ElementsPerGRF) { + // ParentWidth tells us that a row of our region cannot cross a GRF + // boundary. Say that the boundary is at the next multiple of + // ParentWidth. + ElementsToBoundary = std::max(ParentWidth - RealIdx % ParentWidth, + ElementsToBoundary); + } else if (!isa(Indirect->getType())) { + // Use the alignment+offset of the single indirect index, with alignment + // limited to one GRF. + if (!Align.isUnknown()) { + unsigned LogAlign = Align.getLogAlign(); + unsigned ExtraBits = Align.getExtraBits(); + ExtraBits += (Offset + RealIdx * ElementBytes); + ExtraBits &= ((1 << LogAlign) - 1); + if (LogAlign >= LogGRFWidth && !ExtraBits) { + // Start is GRF aligned, so legal width is 1 GRF for <=BDW or + // 2 GRFs for >=SKL. 
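+              // For example (exposition only): a 32-byte-aligned indirect
+              // index with i32 elements gives ElementsPerGRF = 8, so
+              // ElementsToBoundary here becomes 8 on <=BDW (GRFsPerIndirect
+              // is 1) or 16 on >=SKL when GRF crossing is allowed
+              // (GRFsPerIndirect is 2).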
+ ElementsToBoundary = ElementsPerGRF * GRFsPerIndirect; + } else if (LogAlign > (unsigned)genx::log2(ElementBytes) || + (LogAlign == (unsigned)genx::log2(ElementBytes) && + ExtraBits == 0)) { + LogAlign = std::min(LogGRFWidth, LogAlign) - genx::log2(ElementBytes); + ExtraBits = (ExtraBits & (GRFWidth-1)) >> genx::log2(ElementBytes); + // We have some alignment, so we can say that the next GRF boundary + // is (at least) that many elements away, minus the offset from that + // alignment. + // For SKL+, we can cross one GRF boundary, so add on one GRF's + // worth. + unsigned ElementsToBoundaryFromAlign = (1U << LogAlign) - ExtraBits; + ElementsToBoundaryFromAlign += (GRFsPerIndirect - 1) * ElementsPerGRF; + ElementsToBoundary = std::max(ElementsToBoundaryFromAlign, + ElementsToBoundary); + } + } + } + } + + // Now calculate what subregion we can fit in before the boundary + // calculated above. + if (Allow2D && StrideValid) { + if ((!VStride || exactLog2(VStride) >= 0) && exactLog2(Width) >= 0 + && Width <= 16 && !(Idx % Width) + && ElementsToBoundary >= (Width - 1) * Stride + 1) { + // The vstride and width are legal, and we're at the start of a + // row, and ElementsToBoundary is big enough for at least one + // whole row, so we can potentially do more than one whole row at a + // time. See how many we can fit, without including the "slack" + // at the end of the last row. + unsigned NumRows = 0; + if (VStride == 0) // Avoid divide by 0 + NumRows = (NumElements - Idx) / Width; + else { + unsigned LastElementOfRow = (Width - 1) * Stride; + unsigned Slack = VStride - (LastElementOfRow + 1); + NumRows = (ElementsToBoundary + Slack) / VStride; + if (NumRows) { + if (NumRows * Width + Idx > NumElements) + NumRows = (NumElements - Idx) / Width; + } + } + ValidWidth = (1 << genx::log2(NumRows)) * Width; + } + if (ValidWidth == 1 && Idx % Width) { + // That failed. See if we can legally get to the end of the row then + // the same number of elements again at the start of the next row. + unsigned ToEndOfRow = Width - Idx % Width; + if (exactLog2(ToEndOfRow) >= 0 && ToEndOfRow <= 16) { + unsigned NewVStride = VStride + (ToEndOfRow - Width) * Stride; + if (exactLog2(NewVStride) >= 0 + && NewVStride + (ToEndOfRow - 1) * Stride < ElementsToBoundary) { + // Yes, we can do the end of one row and the same size start of + // the next row. + ValidWidth = 2 * ToEndOfRow; + } + } + } + } + if (ValidWidth == 1) { + // That failed. See how many elements we can get, no further than the + // next end of row. + ValidWidth = Width - Idx % Width; + if (ValidWidth * Stride - (Stride - 1) > ElementsToBoundary) + ValidWidth = (ElementsToBoundary + Stride - 1) / Stride; + ValidWidth = 1 << genx::log2(ValidWidth); + } + // If the RStride is 0 (which is seen in splat operations) then the + // above logic tends to determine that all of the elements can fit, + // irrespective of vector size and type. This is usually incorrect + // in the wider context, so clamp it here to whatever fits in 2GRF if + // necessary + if (ValidWidth > (2 * ElementsPerGRF)) + ValidWidth = 2 * ElementsPerGRF; + + } + } + return ValidWidth; +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromStartWr: detect a split (legalized) + * sequence rdregion-wrregion from the start, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). 
+ * + * On success, if the WaitingFor field matches one of the wrregions in the + * sequence, then WaitingFor is reset to 0. This is used by buildFromWr to + * check that the sequence includes the wrregion originally passed to it. + * + * On failure, EndWr is left as is, which means that isNull() continues to + * be true. + */ +bool RdWrRegionSequence::buildFromStartWr(Instruction *ArgStartWr, + GenXBaling *Baling) +{ + StartWr = ArgStartWr; + auto Wr = StartWr; + assert(GenXIntrinsic::isWrRegion(Wr)); + Region TotalWrR(Wr, Baling->getBaleInfo(Wr)); + WrR = TotalWrR; + if (TotalWrR.Mask) + return false; + OldVal = Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (auto Rd = dyn_cast(RdVal)) { + // Handle the case that the start wrregion has a rdregion, so we look for + // a sequence of rd-wr pairs. + if (!GenXIntrinsic::isRdRegion(Rd)) + return false; + Region TotalRdR(Rd, Baling->getBaleInfo(Rd)); + RdR = TotalRdR; + Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + EndWr = Wr; + if (Wr == WaitingFor) + WaitingFor = nullptr; + bool SeenWaitingFor = false; + for (;;) { + if (!Wr->hasOneUse() || Wr->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum) + break; + Wr = cast(Wr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + Value *In = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (!GenXIntrinsic::isRdRegion(In)) + break; + auto Rd = cast(In); + if (Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) != Input) + break; + // Append to the regions. Give up if either fails. + if (!TotalRdR.append(Region(Rd, Baling->getBaleInfo(Rd))) + || !TotalWrR.append(Region(Wr, Baling->getBaleInfo(Wr)))) + break; + SeenWaitingFor |= Wr == WaitingFor; + // If both regions are now legal (have a whole number of rows), then + // save the current position. + if (TotalRdR.isWholeNumRows() && TotalWrR.isWholeNumRows()) { + RdR = TotalRdR; + WrR = TotalWrR; + EndWr = Wr; + if (SeenWaitingFor) + WaitingFor = nullptr; + } + } + return true; + } + if (!isa(Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum))) + return false; + auto TotalC = dyn_cast(RdVal); + if (!TotalC) + return false; + // Handle the case that the start wrregion has a constant "new value" operand + // and an undef "old value" operand. + // We look for a sequence of wrregions where the "new value" operands are all + // constant and we derive the overall constant. + Region TotalRdR(TotalC); + RdR = TotalRdR; + Input = TotalC; + EndWr = Wr; + if (Wr == WaitingFor) + WaitingFor = nullptr; + bool SeenWaitingFor = false; + for (;;) { + if (!Wr->hasOneUse() || Wr->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum) + break; + Wr = cast(Wr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + auto In = dyn_cast(Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!In) + break; + // Append to the regions. Give up if either fails. + Region InR(In); + InR.Offset = TotalRdR.NumElements * TotalRdR.ElementBytes; + if (!TotalRdR.append(InR) + || !TotalWrR.append(Region(Wr, Baling->getBaleInfo(Wr)))) + break; + SeenWaitingFor |= Wr == WaitingFor; + // Append the constant. + TotalC = concatConstants(TotalC, In); + // If both regions are now legal (have a whole number of rows), then save + // the current position. 
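+      // For example (exposition only): a 16-element write legalized into two
+      // 8-wide rd/wr pairs is accepted one pair at a time; after the second
+      // pair is appended, the accumulated read and write regions again cover
+      // whole rows, so EndWr advances to that wrregion.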
+ if (TotalRdR.isWholeNumRows() && TotalWrR.isWholeNumRows()) { + RdR = TotalRdR; + WrR = TotalWrR; + EndWr = Wr; + Input = TotalC; + if (SeenWaitingFor) + WaitingFor = nullptr; + } + } + return true; +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromWr: detect a split (legalized) rdregion-wrregion + * sequence starting from any wrregion within it, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). + * + * On failure, EndWr is left as is, which means that isNull() continues to + * be true. + */ +bool RdWrRegionSequence::buildFromWr(Instruction *Wr, GenXBaling *Baling) +{ + // Remember that our sequence needs to contain Wr. + WaitingFor = Wr; + // Scan back to what looks like the start of the sequence. + assert(GenXIntrinsic::isWrRegion(Wr)); + StartWr = Wr; + auto RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + auto Rd = dyn_cast(RdVal); + bool ConstSequence = false; + if (!Rd) { + if (!isa(RdVal)) + return 0; + ConstSequence = true; + } else + Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + for (;;) { + Wr = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + assert(Wr); + if (!Wr->hasOneUse()) + break; + RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (ConstSequence) { + if (!isa(RdVal)) + break; + } else { + Rd = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!Rd) + break; + if (Input != Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)) + break; + } + StartWr = Wr; + } + // Try detecting a split rdregion-wrregion starting at StartWr. + for (;;) { + if (!buildFromStartWr(StartWr, Baling)) { + EndWr = nullptr; + return false; + } + if (!WaitingFor) + return true; // success + // The detected sequence did not include the wrregion this function + // started with. Retry with the following sequence. + StartWr = cast(EndWr->use_begin()->getUser()); + if (GenXIntrinsic::isWrRegion(StartWr)) + return false; + } +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromRd: detect a split (legalized) rdregion-wrregion + * sequence starting from any rdregion within it, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). 
+ */ +bool RdWrRegionSequence::buildFromRd(Instruction *Rd, GenXBaling *Baling) +{ + assert(GenXIntrinsic::isRdRegion(Rd)); + if (!Rd->hasOneUse()) + return false; + if (Rd->use_begin()->getOperandNo() != GenXIntrinsic::GenXRegion::NewValueOperandNum) + return false; + auto Wr = cast(Rd->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + return false; + return buildFromWr(Wr, Baling); +} + +/*********************************************************************** + * RdWrRegionSequence::size : get number of rdregion-wrregion pairs in the + * sequence + */ +unsigned RdWrRegionSequence::size() const +{ + unsigned Size = 1; + Instruction *Wr = EndWr; + for ( ; Wr != StartWr; ++Size) + Wr = cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + return Size; +} + +/*********************************************************************** + * RdWrRegionSequence::isOnlyUseOfInput : check whether the sequence is the + * only use of its input + */ +bool RdWrRegionSequence::isOnlyUseOfInput() const +{ + unsigned Count = 0; + for (auto ui = Input->use_begin(), ue = Input->use_end(); + ui != ue; ++ui) + ++Count; + return Count == size(); +} + +/*********************************************************************** + * RdWrRegionSequence::getRdIndex : get the index of the legalized rdregion + */ +Value *RdWrRegionSequence::getRdIndex() const +{ + if (isa(Input)) + return ConstantInt::get(Type::getInt16Ty(StartWr->getContext()), 0); + auto Rd = cast( + StartWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + assert(GenXIntrinsic::isRdRegion(Rd)); + return Rd->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::getWrIndex : get the index of the legalized wrregion + */ +Value *RdWrRegionSequence::getWrIndex() const +{ + return StartWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::getInputUse : get some use of Input in the sequence + * + * This only works if the RdWrRegionSequence is a sequence of rd-wr pairs, + * rather than a sequence of wrregions with constant input. In the latter + * case, this returns 0. 
+ */ +Use *RdWrRegionSequence::getInputUse() const +{ + auto Rd = dyn_cast( + StartWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!GenXIntrinsic::isRdRegion(Rd)) + return nullptr; + assert(Rd && Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == Input); + return &Rd->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::print : debug dump/print + */ +void RdWrRegionSequence::print(raw_ostream &OS) const +{ + if (isNull()) + OS << "null"; + else { + OS << "sequence"; + if (OldVal) + dbgs() << " OldVal=" << OldVal->getName(); + dbgs() << " Input=" << Input->getName() + << " StartWr=" << StartWr->getName() + << " EndWr=" << EndWr->getName() + << " RdR=" << RdR + << " WrR=" << WrR; + } +} + +static Value *simplifyRegionWrite(Instruction *Inst) { + assert(GenXIntrinsic::isWrRegion(Inst)); + Value *NewVal = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + + // Replace C with A + // C = wrregion(A, undef, R) + if (isa(NewVal)) + return Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + + // When A and undef have the same type, replace C with A + // B = rdregion(A, R) + // C = wrregion(undef, B, R) + // + // or replace C by A + // + // B = rdregion(A, R) + // C = wrregion(A, B, R) + // + if (GenXIntrinsic::isRdRegion(NewVal)) { + Instruction *B = cast(NewVal); + Region InnerR(B, BaleInfo()); + Region OuterR(Inst, BaleInfo()); + if (OuterR != InnerR) + return nullptr; + + auto OldValB = B->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto OldValC = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if ((isa(OldValC) && + OldValB->getType() == OldValC->getType()) || + OldValB == OldValC) + return OldValB; + } + + return nullptr; +} + +static Value *simplifyRegionRead(Instruction *Inst) { + assert(GenXIntrinsic::isRdRegion(Inst)); + Value *Input = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (isa(Input)) + return UndefValue::get(Inst->getType()); + else if (auto C = dyn_cast(Input)) { + if (auto Splat = C->getSplatValue()) { + Type *Ty = Inst->getType(); + if (Ty->isVectorTy()) + Splat = ConstantVector::getSplat(Ty->getVectorNumElements(), Splat); + return Splat; + } + } else if (GenXIntrinsic::isWrRegion(Input) && Input->hasOneUse()) { + // W = wrr(A, B, R) + // C = rdr(W, R) + // => + // replace C by B + Instruction *WI = cast(Input); + Region R1(WI, BaleInfo()); + Region R2(Inst, BaleInfo()); + if (R1 == R2) { + Value *B = WI->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (B->getType() == Inst->getType()) + return B; + } + } + return nullptr; +} + +// Simplify a region read or write. 
+Value *llvm::genx::simplifyRegionInst(Instruction *Inst, const DataLayout *DL) { + if (Inst->use_empty()) + return nullptr; + + if (Constant *C = ConstantFoldGenX(Inst, *DL)) + return C; + + unsigned ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (ID) { + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + return simplifyRegionWrite(Inst); + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: + return simplifyRegionRead(Inst); + default: + break; + } + return nullptr; +} + +bool llvm::genx::simplifyRegionInsts(Function *F, const DataLayout *DL) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto V = simplifyRegionInst(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +// Cleanup loads. +// %load1 = load *m +// %load2 = load *m +// no store to m +// use(load1, load2) +// +bool llvm::genx::cleanupLoads(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + // The dominating loads (may have different types) for each variable. + std::unordered_map> DomLoads; + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto SI = dyn_cast(Inst)) { + auto GV = getUnderlyingGlobalVariable(SI->getPointerOperand()); + if (!GV) + continue; + // Kill all live loads on this variable. + DomLoads[GV].clear(); + } else if (auto LI = dyn_cast(Inst)) { + auto GV = getUnderlyingGlobalVariable(LI->getPointerOperand()); + if (!GV) + continue; + auto &Loads = DomLoads[GV]; + LoadInst *DomLI = nullptr; + for (auto LI1 : Loads) { + if (LI1->getType() == LI->getType()) { + DomLI = LI1; + break; + } + } + if (DomLI == nullptr) + Loads.push_back(LI); + else { + LI->replaceAllUsesWith(DomLI); + LI->eraseFromParent(); + Changed = true; + } + } + } + } + return Changed; +} + +bool +llvm::genx::IsLinearVectorConstantInts(Value* v, int64_t& start, int64_t& stride) { + auto cv = dyn_cast(v); + if (!cv) + return false; + // Flatten the vector out into the elements array + llvm::SmallVector elements; + auto vectorLength = cv->getType()->getVectorNumElements(); + for (unsigned i = 0; i < vectorLength; ++i) + elements.push_back(cv->getElementAsConstant(i)); + + llvm::ConstantInt* ci = llvm::dyn_cast(elements[0]); + if (ci == NULL) + return false; // Not a vector of integers + + int64_t val0 = ci->getSExtValue(); + if (vectorLength == 1) { + start = val0; + stride = 0; + return true; + } + ci = llvm::dyn_cast(elements[1]); + if (ci == NULL) + return false; // Not a vector of integers + int64_t prevVal = ci->getSExtValue(); + int64_t diff = prevVal - val0; + + // For each element in the array, see if it is both a ConstantInt and + // if the difference between it and the value of the previous element + // is stride. If not, fail. 
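+  // For example (exposition only): <0, 4, 8, 12> yields start = 0 and
+  // stride = 4, while <0, 4, 9, 12> fails at the third element because
+  // 4 + 4 != 9.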
+ for (int i = 2; i < (int)vectorLength; ++i) { + ci = llvm::dyn_cast(elements[i]); + if (ci == NULL) + return false; + + int64_t nextVal = ci->getSExtValue(); + if (prevVal + diff != nextVal) + return false; + + prevVal = nextVal; + } + start = val0; + stride = diff; + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h new file mode 100644 index 000000000000..6e312bf90a31 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h @@ -0,0 +1,197 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRegion : region information +/// ------------------------------- +/// +/// Refer to the comments in the base class CMRegion defined in +/// llvm/Transform/Scalar. +/// +/// Function added for the GenXRegion +/// +/// * Construct from a rdregion/wrregion intrinsic, setting the GenXRegion +/// to the region described by the intrinsic. This constructor also takes the +/// BaleInfo as an argument, allowing a variable index that is a baled in +/// constant add to be considered as a separate variable index and constant +/// offset. +/// +/// GenXLegalization uses GenXRegion to determine whether a region is legal, +/// and split it up if necessary. First it constructs a GenXRegion, then it +/// has a loop to split it into legal regions. Each loop iteration calls: +/// +/// * the getLegalSize method (see below) to determine the split size; then +/// * getSubregion to modify the GenXRegion for the split size; then +/// * one of the methods to create a new rdregion or wrregion intrinsic. +/// +/// GenXRegion::getLegalSize +/// ^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The ``getLegalSize`` method is used by GenXLegalization and some other +/// passes to determine whether a region is legal, and if not how small +/// a split is required to make it legal. +/// +/// It takes the GenXSubtarget as an argument, because it needs to know +/// architecture-specific details, currently just whether a single GRF +/// crossing is allowed in an indirect region. +/// +/// It also takes either an AlignmentInfo object, or the actual alignment +/// of the indirect index (if any). Knowing the alignment of the indirect +/// index can help allow a larger legal region, and avoid needing to split +/// into simd1. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef GENXREGION_H +#define GENXREGION_H + +#include "GenXAlignmentInfo.h" +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallBitVector.h" + +namespace llvm { + class Constant; + class DataLayout; + class Value; + class Function; + class GenXBaling; + class GenXSubtarget; + class Module; + class Type; + class Instruction; + class raw_ostream; + class Twine; + class DebugLoc; + class TargetLibraryInfo; + +namespace genx { + struct BaleInfo; + +// Region : description of an operand's region +class Region : public CMRegion { +public: + static Region getWithOffset(Instruction *Inst, bool WantParentWith = false); + // Default constructor: assume single element + Region() : CMRegion() {} + // Construct from a type. + Region(Type *Ty, const DataLayout *DL = nullptr) : CMRegion(Ty, DL) {}; + // Construct from a value. + Region(Value *V, const DataLayout *DL = nullptr) : CMRegion(V, DL) {}; + // Construct from a rd/wr region/element and its BaleInfo + Region(Instruction *Inst, const BaleInfo &BI, bool WantParentWidth = false); + // Construct from a bitmap of which elements to set (legal 1D region) + Region(unsigned Bits, unsigned ElementBytes) + : CMRegion(Bits, ElementBytes) {}; + // getLegalSize : get the max legal size of a region + unsigned getLegalSize(unsigned Idx, bool Allow2D, unsigned InputNumElements, + const GenXSubtarget *ST, AlignmentInfo *AI = nullptr); + unsigned getLegalSize(unsigned Idx, bool Allow2D, unsigned InputNumElements, + const GenXSubtarget *ST, Alignment Align); +}; + +// RdWrRegionSequence : a sequence of rdregion-wrregion pairs probably +// created by legalization or coalescing, conforming to the following +// rules: +// +// 1. It is a sequence of wrregions, each one (other than the last) +// having the next one's "old value" input as its only use. +// +// 2. Each wrregion's "new value" input is a single-use rdregion. +// +// 3. All the rdregions have the same "old value" input. +// +// 4. If the rdregions have a variable index, the index is the same for each +// one, other than the constant offset from a baled in genx.add.addr. +// +// 5. The rdregion regions are such that they can be combined to give the +// region parameters of the original unsplit rdregion. Those rdregion +// parameters are stored in the RdR field. +// +// 6. If the wrregions have a variable index, the index is the same for each +// one, other than the constant offset from a baled in genx.add.addr. +// +// 7. The wrregion regions are such that they can be combined to give the +// region parameters of the original unsplit wrregion. Those wrregion +// parameters are stored in the WrR field. +// +// Alternatively, a RdWrRegionSequence can represent a sequence of wrregion +// instructions with undef "old value" input to the first one and constant +// "new value" input to each one, forming a legalized constant load. +// +class RdWrRegionSequence { + Instruction *WaitingFor = nullptr; +public: + Value *Input = nullptr; + Value *OldVal = nullptr; + Instruction *StartWr = nullptr; + Instruction *EndWr = nullptr; + Region RdR; + Region WrR; + // Default constructor + RdWrRegionSequence() : Input(nullptr), EndWr(nullptr) {} + // isNull : true if the RdWrRegionSequence has not been initialized + bool isNull() const { return !EndWr && !Input; } + // Scan for sequence from the start wrregion instruction. + // Returns false if not even a single rdregion-wrregion pair found. 
+ bool buildFromStartWr(Instruction *Wr, GenXBaling *Baling); + // Scan for sequence from any wrregion instruction in the sequence. + // Returns false if not even a single rdregion-wrregion pair found. + bool buildFromWr(Instruction *Wr, GenXBaling *Baling); + // Scan for sequence from any rdregion instruction in the sequence. + // Returns false if not even a single rdregion-wrregion pair found. + bool buildFromRd(Instruction *Rd, GenXBaling *Baling); + // Get number of rdregion-wrregion pairs in the sequence + unsigned size() const; + // Check whether the sequence is the only use of its input + bool isOnlyUseOfInput() const; + // Get the index of the legalized rdregion + Value *getRdIndex() const; + // Get the index of the legalized wrregion + Value *getWrIndex() const; + // Get some use of Input in the sequence + Use *getInputUse() const; + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const RdWrRegionSequence &RWS) { + RWS.print(OS); + return OS; +} + +Value *simplifyRegionInst(Instruction *Inst, const DataLayout *DL); +bool simplifyRegionInsts(Function *F, const DataLayout *DL); + +bool cleanupLoads(Function *F); + +bool IsLinearVectorConstantInts(Value* v, int64_t& start, int64_t& stride); + +} // end namespace genx + +} // end namespace llvm + +#endif /* GENXREGION_H */ diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp new file mode 100644 index 000000000000..9bbdce584635 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp @@ -0,0 +1,1460 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRegionCollapsing +/// -------------------- +/// +/// GenX region collapsing pass is function pass that collapses nested +/// read regions or nested write regions. +/// +/// Nested region accesses can occur in two ways (or a mixture of both): +/// +/// 1. The front end compiler deliberately generates nested region access. The +/// CM compiler does this for a matrix select, generating a region access for +/// the rows and another one for the columns, safe in the knowledge that this +/// pass will combine them where it can. +/// +/// 2. Two region accesses in different source code constructs (e.g. 
two select() +/// calls, either in the same or different source statements). +/// +/// The combineRegions() function is what makes the decisions on whether two +/// regions can be collapsed, depending on whether they are 1D or 2D, how the +/// rows of one fit in the rows of the other, whether each is indirect, etc. +/// +/// This pass makes an effort to combine two region accesses even if there are +/// multiple bitcasts (from CM format()) or up to one SExt/ZExt (from a cast) in +/// between. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_RegionCollapsing" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenX region collapsing pass +class GenXRegionCollapsing : public FunctionPass { + const DataLayout *DL = nullptr; + DominatorTree *DT = nullptr; + bool Modified = false; +public: + static char ID; + explicit GenXRegionCollapsing() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX Region Collapsing"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + } + bool runOnFunction(Function &F); +private: + void runOnBasicBlock(BasicBlock *BB); + void processBitCast(BitCastInst *BC); + void processRdRegion(Instruction *InnerRd); + void splitReplicatingIndirectRdRegion(Instruction *Rd, Region *R); + void processWrRegionElim(Instruction *OuterWr); + Instruction *processWrRegionBitCast(Instruction *WrRegion); + void processWrRegionBitCast2(Instruction *WrRegion); + Instruction *processWrRegion(Instruction *OuterWr); + Instruction *processWrRegionSplat(Instruction *OuterWr); + bool normalizeElementType(Region *R1, Region *R2, bool PreferFirst = false); + bool combineRegions(const Region *OuterR, const Region *InnerR, + Region *CombinedR); + void calculateIndex(const Region *OuterR, const Region *InnerR, + Region *CombinedR, Value *InnerIndex, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + Value *insertOp(Instruction::BinaryOps Opcode, Value *Lhs, unsigned Rhs, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); + Value *insertOp(Instruction::BinaryOps Opcode, Value *Lhs, Value *Rhs, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); +}; + +}// end namespace llvm + + +char GenXRegionCollapsing::ID = 0; +namespace llvm { void initializeGenXRegionCollapsingPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXRegionCollapsing, "GenXRegionCollapsing", + "GenXRegionCollapsing", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXRegionCollapsing, "GenXRegionCollapsing", + "GenXRegionCollapsing", false, false) + +// Publicly exposed interface to pass... 
+FunctionPass *llvm::createGenXRegionCollapsingPass() +{ + initializeGenXRegionCollapsingPass(*PassRegistry::getPassRegistry()); + return new GenXRegionCollapsing(); +} + +/*********************************************************************** + * runOnFunction : run the region collapsing pass for this Function + */ +bool GenXRegionCollapsing::runOnFunction(Function &F) +{ + DL = &F.getParent()->getDataLayout(); + DT = &getAnalysis().getDomTree(); + + // Track if there is any modification to the function. + bool Changed = false; + + // This does a postordered depth first traversal of the CFG, processing + // instructions within a basic block in reverse, to ensure that we see a def + // after its uses (ignoring phi node uses). + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); + i != e; ++i) { + // Iterate until there is no modification. + BasicBlock *BB = *i; + do { + Modified = false; + runOnBasicBlock(BB); + if (Modified) + Changed = true; + } while (Modified); + } + + return Changed; +} + +static bool lowerTrunc(TruncInst *Inst) { + Value *InValue = Inst->getOperand(0); + if (!GenXIntrinsic::isRdRegion(InValue)) + return false; + + Type *InElementTy = InValue->getType(); + Type *OutElementTy = Inst->getType(); + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(InElementTy)) { + InElementTy = VT->getElementType(); + OutElementTy = cast(OutElementTy)->getElementType(); + NumElements = VT->getNumElements(); + } + unsigned OutBitSize = OutElementTy->getPrimitiveSizeInBits(); + assert(OutBitSize); + // Do not touch truncations to i1 or vector of i1 types. + if (OutBitSize == 1) + return false; + unsigned Stride = InElementTy->getPrimitiveSizeInBits() / OutBitSize; + + // Create the new bitcast. + Instruction *BC = + CastInst::Create(Instruction::BitCast, InValue, + VectorType::get(OutElementTy, Stride * NumElements), + Inst->getName(), Inst /*InsertBefore*/); + BC->setDebugLoc(Inst->getDebugLoc()); + + // Create the new rdregion. + Region R(BC); + R.NumElements = NumElements; + R.Stride = Stride; + R.Width = NumElements; + R.VStride = R.Stride * R.Width; + Instruction *NewInst = R.createRdRegion( + BC, Inst->getName(), Inst /*InsertBefore*/, Inst->getDebugLoc(), + !isa(Inst->getType()) /*AllowScalar*/); + + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + return true; +} + +void GenXRegionCollapsing::runOnBasicBlock(BasicBlock *BB) { + // Code simplification in block first. + for (auto BI = BB->begin(), E = --BB->end(); BI != E;) { + assert(!BI->isTerminator()); + Instruction *Inst = &*BI++; + if (Inst->use_empty()) + continue; + + // Turn trunc into bitcast followed by rdr. This helps region collapsing in + // a later stage. + if (auto TI = dyn_cast(Inst)) { + Modified |= lowerTrunc(TI); + continue; + } + + // Simplify + // %1 = call <1 x i32> @rdr(...) + // %2 = extractelement <1 x i32> %1, i32 0 + // into + // %2 = call i32 @rdr(...) + // + if (auto EEI = dyn_cast(Inst)) { + Value *Src = EEI->getVectorOperand(); + if (GenXIntrinsic::isRdRegion(Src) && Src->getType()->getVectorNumElements() == 1) { + // Create a new region with scalar output. 
+ Region R(Inst); + Instruction *NewInst = + R.createRdRegion(Src, Inst->getName(), Inst /*InsertBefore*/, + Inst->getDebugLoc(), true /*AllowScalar*/); + Inst->replaceAllUsesWith(NewInst); + Modified = true; + continue; + } + } + + if (Value *V = simplifyRegionInst(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Modified = true; + continue; + } + + // sink index calculation before region collapsing. For collapsed regions, + // it is more difficult to lift constant offsets. + static const unsigned NOT_INDEX = 255; + unsigned Index = NOT_INDEX; + + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (GenXIntrinsic::isRdRegion(IID)) + Index = GenXIntrinsic::GenXRegion::RdIndexOperandNum; + else if (GenXIntrinsic::isWrRegion(IID)) + Index = GenXIntrinsic::GenXRegion::WrIndexOperandNum; + else if (isa(Inst)) + Index = 2; + else if (isa(Inst)) + Index = 1; + + if (Index != NOT_INDEX) { + Use *U = &Inst->getOperandUse(Index); + Value *V = sinkAdd(*U); + if (V != U->get()) { + *U = V; + Modified = true; + } + } + } + Modified |= SimplifyInstructionsInBlock(BB); + + // This loop processes instructions in reverse, tolerating an instruction + // being removed during its processing, and not re-processing any new + // instructions added during the processing of an instruction. + for (Instruction *Prev = BB->getTerminator(); Prev;) { + Instruction *Inst = Prev; + Prev = nullptr; + if (Inst != &BB->front()) + Prev = Inst->getPrevNode(); + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + processRdRegion(Inst); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + processWrRegionElim(Inst); + if (!Inst->use_empty()) { + if (auto NewInst = processWrRegionBitCast(Inst)) { + Modified = true; + Inst = NewInst; + } + auto NewInst1 = processWrRegionSplat(Inst); + if (Inst != NewInst1) { + Modified = true; + Inst = NewInst1; + } + + auto NewInst = processWrRegion(Inst); + processWrRegionBitCast2(NewInst); + if (Inst != NewInst && NewInst->use_empty()) { + NewInst->eraseFromParent(); + Modified = true; + } + } + if (Inst->use_empty()) { + Inst->eraseFromParent(); + Modified = true; + } + break; + default: + if (auto BC = dyn_cast(Inst)) + processBitCast(BC); + if (isa(Inst) && Inst->use_empty()) { + // Remove bitcast that has become unused due to changes in this pass. 
+ Inst->eraseFromParent(); + Modified = true; + } + break; + } + } +} + +/*********************************************************************** + * createBitCast : create a bitcast, combining bitcasts where applicable + */ +static Value *createBitCast(Value *Input, Type *Ty, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL) { + if (Input->getType() == Ty) + return Input; + if (auto BC = dyn_cast(Input)) + Input = BC->getOperand(0); + if (Input->getType() == Ty) + return Input; + auto NewBC = CastInst::Create(Instruction::BitCast, Input, Ty, + Name, InsertBefore); + NewBC->setDebugLoc(DL); + return NewBC; +} + +/*********************************************************************** + * createBitCastToElementType : create a bitcast to a vector with the + * specified element type, combining bitcasts where applicable + */ +static Value *createBitCastToElementType(Value *Input, Type *ElementTy, + const Twine &Name, + Instruction *InsertBefore, + const DataLayout *DL, + const DebugLoc &DbgLoc) { + unsigned ElBytes = ElementTy->getPrimitiveSizeInBits() / 8U; + if (!ElBytes) { + assert(ElementTy->isPointerTy() && ElementTy->getPointerElementType()->isFunctionTy()); + ElBytes = DL->getTypeSizeInBits(ElementTy) / 8; + } + unsigned InputBytes = Input->getType()->getPrimitiveSizeInBits() / 8U; + if (!InputBytes) { + Type *T = Input->getType(); + if (T->isVectorTy()) + T = T->getVectorElementType(); + assert(T->isPointerTy() && T->getPointerElementType()->isFunctionTy()); + InputBytes = DL->getTypeSizeInBits(T) / 8; + } + assert(!(InputBytes & (ElBytes - 1)) && "non-integral number of elements"); + auto Ty = VectorType::get(ElementTy, InputBytes / ElBytes); + return createBitCast(Input, Ty, Name, InsertBefore, DbgLoc); +} + +/*********************************************************************** + * combineBitCastWithUser : if PossibleBC is a bitcast, and it has a single + * user that is also a bitcast, then combine them + * + * If combined, the two bitcast instructions are erased. + * + * This can happen because combining two rdregions with a bitcast between + * them can result in the bitcast being used by another bitcast that was + * already there. + */ +static void combineBitCastWithUser(Value *PossibleBC) +{ + if (auto BC1 = dyn_cast(PossibleBC)) { + if (BC1->hasOneUse()) { + if (auto BC2 = dyn_cast(BC1->use_begin()->getUser())) { + Value *CombinedBC = BC1->getOperand(0); + if (CombinedBC->getType() != BC2->getType()) + CombinedBC = createBitCast(BC1->getOperand(0), BC2->getType(), + BC2->getName(), BC2, BC2->getDebugLoc()); + BC2->replaceAllUsesWith(CombinedBC); + BC2->eraseFromParent(); + BC1->eraseFromParent(); + } + } + } +} + +/*********************************************************************** + * processBitCast : process a bitcast whose input is rdregion + * + * We put the bitcast before the rdregion, in the hope that it will enable + * the rdregion to be baled in to something later on. + */ +void GenXRegionCollapsing::processBitCast(BitCastInst *BC) +{ + if (BC->getType()->getScalarType()->isIntegerTy(1)) + return; + auto Rd = dyn_cast(BC->getOperand(0)); + + // check if skipping this optimization. + auto skip = [=] { + // Skip if this is not rdregion + if (!Rd || !GenXIntrinsic::isRdRegion(Rd)) + return true; + + // Single use, do optimization. + if (Rd->hasOneUse()) + return false; + + // More than one uses, we check if rdr is reading from a global. + // If yes, still do such conversion, as bitcast could be folded into g_load. 
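+    // Illustrative pattern (hypothetical IR): if the rdregion's source is
+    //   %v = load <32 x i32>, <32 x i32>* @g
+    // (possibly behind casts), the bitcast we are about to hoist in front of
+    // the rdregion can later be folded into the load of the global @g, so the
+    // rewrite is still worthwhile even with several users of the rdregion.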
+ Value *Op0 = Rd->getOperand(0); + while (auto CI = dyn_cast(Op0)) + Op0 = CI->getOperand(0); + auto LI = dyn_cast(Op0); + if (LI && getUnderlyingGlobalVariable(LI->getPointerOperand())) + return false; + + // skip otherwise; + return true; + }; + + if (skip()) + return; + + // skip call above shall check for RdRegion among other things + assert(Rd && GenXIntrinsic::isRdRegion(Rd)); + + // We have a single use rdregion as the input to the bitcast. + // Adjust the region parameters if possible so the element type is that of + // the result of the bitcast, instead of the input. + Region ROrig(Rd, BaleInfo()); + Region R(Rd, BaleInfo()); + auto ElTy = BC->getType()->getScalarType(); + if (!R.changeElementType(ElTy)) + return; + + // we do not want this optimization to be applied if resulting indirect + // region will have non-zero stride or non-single width + // this will require ineffective legalization in those cases + bool OrigCorr = ((ROrig.Width == 1) || (ROrig.Stride == 0)); + bool ChangedWrong = ((R.Width != 1) && (R.Stride != 0)); + if (OrigCorr && ChangedWrong && R.Indirect) + return; + + // Create the new bitcast. + assert(ElTy->getPrimitiveSizeInBits()); + auto Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewBCTy = VectorType::get(ElTy, + Input->getType()->getPrimitiveSizeInBits() / ElTy->getPrimitiveSizeInBits()); + auto NewBC = CastInst::Create(Instruction::BitCast, Input, NewBCTy, "", Rd); + NewBC->takeName(BC); + NewBC->setDebugLoc(BC->getDebugLoc()); + // Create the new rdregion. + auto NewRd = R.createRdRegion(NewBC, "", Rd, Rd->getDebugLoc(), + /*AllowScalar=*/!isa(BC->getType())); + NewRd->takeName(Rd); + // Replace uses. + BC->replaceAllUsesWith(NewRd); + // Caller removes BC. + Modified = true; +} + +/*********************************************************************** + * processRdRegion : process a rdregion + * + * 1. If this rdregion is unused, it probably became so in the processing + * of a later rdregion. Erase it. + * + * 2. Otherwise, see if the input to this rdregion is the result of + * an earlier rdregion, and if so see if they can be combined. This can + * work even if there are bitcasts and up to one sext/zext between the + * two rdregions. + */ +void GenXRegionCollapsing::processRdRegion(Instruction *InnerRd) +{ + if (InnerRd->use_empty()) { + InnerRd->eraseFromParent(); + Modified = true; + return; + } + + // We use Region::getWithOffset to get a Region object for a rdregion/wrregion + // throughout this pass, in order to ensure that, with an index that is + // V+const, we get the V and const separately (in Region::Indirect and + // Region::Offset). Then our index calculations can ensure that the constant + // add remains the last thing that happens in the calculation. 
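+  // For illustration (names hypothetical): if the rdregion's index operand is
+  //   %idx = add i16 %base, 32
+  // then getWithOffset records Indirect = %base and Offset = 32 (bytes) rather
+  // than Indirect = %idx and Offset = 0, so a rebuilt index can keep the
+  // "+ 32" as its final constant add.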
+ Region InnerR = Region::getWithOffset(InnerRd, /*WantParentWidth=*/true); + + // Prevent region collapsing for specific src replication pattern, + // in order to enable swizzle optimization for Align16 instruction + if (InnerRd->hasOneUse()) { + if (auto UseInst = dyn_cast(InnerRd->use_begin()->getUser())) { + if (UseInst->getOpcode() == Instruction::FMul) { + auto NextInst = dyn_cast(UseInst->use_begin()->getUser()); + if (NextInst && + (NextInst->getOpcode() == Instruction::FAdd || + NextInst->getOpcode() == Instruction::FSub) && + InnerR.ElementTy->getPrimitiveSizeInBits() == 64U && + InnerR.Width == 2 && + InnerR.Stride == 0 && + InnerR.VStride == 2) + return; + } + } + } + + for (;;) { + Instruction *OuterRd = dyn_cast(InnerRd->getOperand(0)); + // Go through any bitcasts and up to one sext/zext if necessary to find the + // outer rdregion. + Instruction *Extend = nullptr; + bool HadElementTypeChange = false; + for (;;) { + if (!OuterRd) + break; // input not result of earlier rdregion + if (GenXIntrinsic::isRdRegion(OuterRd)) + break; // found the outer rdregion + if (isa(OuterRd) || isa(OuterRd)) { + if (OuterRd->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + OuterRd = nullptr; + break; // input not result of earlier rdregion + } + if (Extend || HadElementTypeChange) { + OuterRd = nullptr; + break; // can only have one sext/zext between the rdregions, and + // sext/zext not allowed if it is then subject to a bitcast + // that changes the element type + } + // Remember the sext/zext instruction. + Extend = OuterRd; + } else if (isa(OuterRd)) { + if (OuterRd->getType()->getScalarType() + != OuterRd->getOperand(0)->getType()->getScalarType()) + HadElementTypeChange = true; + } else { + OuterRd = nullptr; + break; // input not result of earlier rdregion + } + OuterRd = dyn_cast(OuterRd->getOperand(0)); + } + if (!OuterRd) + break; // no outer rdregion that we can combine with + Region OuterR = Region::getWithOffset(OuterRd); + // There was a sext/zext. Because we are going to put that after the + // collapsed region, we want to modify the inner region to the + // extend's input element type without changing the region parameters + // (other than scaling the offset). We know that there is no element + // type changing bitcast between the extend and the inner rdregion. + if (Extend) { + if (InnerR.Indirect) + return; // cannot cope with indexed inner region and sext/zext + InnerR.ElementTy = Extend->getOperand(0)->getType()->getScalarType(); + unsigned ExtInputElementBytes + = InnerR.ElementTy->getPrimitiveSizeInBits() / 8U; + InnerR.Offset = InnerR.Offset / InnerR.ElementBytes * ExtInputElementBytes; + InnerR.ElementBytes = ExtInputElementBytes; + } + // See if the regions can be combined. We call normalizeElementType with + // InnerR as the first arg so it prefers to normalize to that region's + // element type if possible. That can avoid a bitcast being put after the + // combined rdregion, which can help baling later on. 
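+    // Illustration (hypothetical types): if OuterRd reads i32 elements while
+    // InnerRd, reached through a bitcast, reads i16 elements, preferring
+    // InnerR's element type re-expresses the outer region in i16 terms, so the
+    // combined rdregion already yields the i16 vector the original InnerRd
+    // produced and no trailing bitcast is needed.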
+ LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::processRdRegion:\n" + " OuterRd (line " << OuterRd->getDebugLoc().getLine() << "): " << *OuterRd << "\n" + " InnerRd (line " << InnerRd->getDebugLoc().getLine() << "): " << *InnerRd << "\n"); + if (!normalizeElementType(&InnerR, &OuterR, /*PreferFirst=*/true)) { + LLVM_DEBUG(dbgs() << "Cannot normalize element type\n"); + return; + } + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return; // cannot combine + + // If the combined region is both indirect and splat, then do not combine. + // Otherwise, this leads to an infinite loop as later on we split such + // region reads. + auto isIndirectSplat = [](const Region &R) { + if (!R.Indirect) + return false; + if (R.Width != R.NumElements && !R.VStride && + !isa(R.Indirect->getType())) + return true; + if (R.Width == 1 || R.Stride) + return false; + return true; + }; + if (isIndirectSplat(CombinedR)) + return; + + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerRd->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum), + InnerRd->getName() + ".indexcollapsed", + InnerRd, InnerRd->getDebugLoc()); + } + // If the element type of the combined region does not match that of the + // outer region, we need to do a bitcast first. + Value *Input = OuterRd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + // InnerR.ElementTy not always equal to InnerRd->getType()->getScalarType() (look above) + if (InnerR.ElementTy != OuterRd->getType()->getScalarType()) + Input = createBitCastToElementType(Input, InnerR.ElementTy, + Input->getName() + + ".bitcast_before_collapse", + OuterRd, DL, OuterRd->getDebugLoc()); + // Create the combined rdregion. + Instruction *CombinedRd = CombinedR.createRdRegion(Input, + InnerRd->getName() + ".regioncollapsed", InnerRd, InnerRd->getDebugLoc(), + !isa(InnerRd->getType())); + // If we went through sext/zext, re-instate it here. + Value *NewVal = CombinedRd; + if (Extend) { + auto NewCI = CastInst::Create((Instruction::CastOps)Extend->getOpcode(), + NewVal, InnerRd->getType(), Extend->getName(), InnerRd); + NewCI->setDebugLoc(Extend->getDebugLoc()); + NewVal = NewCI; + } + // If we still don't have the right type due to bitcasts in the original + // code, add a bitcast here. + NewVal = createBitCast(NewVal, InnerRd->getType(), + NewVal->getName() + ".bitcast_after_collapse", InnerRd, + InnerRd->getDebugLoc()); + // Replace the inner read with the new value, and erase the inner read. + // any other instructions between it and the outer read (inclusive) that + // become unused. + InnerRd->replaceAllUsesWith(NewVal); + InnerRd->eraseFromParent(); + Modified = true; + // Check whether we just created a bitcast that can be combined with its + // user. If so, combine them. + combineBitCastWithUser(NewVal); + InnerRd = CombinedRd; + InnerR = Region::getWithOffset(InnerRd, /*WantParentWidth=*/true); + // Because the loop in runOnFunction does not re-process the new rdregion, + // loop back here to re-process it. + } + // InnerRd and InnerR are now the combined rdregion (or the original one if + // no combining was done). + // Check whether we have a rdregion that is both indirect and replicating, + // that we want to split. 
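+  // For example (hypothetical region parameters): an indirect rdregion with
+  // NumElements = 16, Width = 4, VStride = 0 reads the same 4 indirectly
+  // addressed elements four times over; splitting it lets the indirect access
+  // fetch those 4 elements once and a cheap direct region do the replication.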
+ splitReplicatingIndirectRdRegion(InnerRd, &InnerR); +} + +/*********************************************************************** + * splitReplicatingIndirectRdRegion : if the rdregion is both indirect and + * replicating, split out the indirect part so it is read only once + */ +void GenXRegionCollapsing::splitReplicatingIndirectRdRegion( + Instruction *Rd, Region *R) +{ + if (!R->Indirect) + return; + if (R->Width != R->NumElements && !R->VStride + && !isa(R->Indirect->getType())) { + // Replicating rows. We want an indirect region that just reads + // one row + Region IndirR = *R; + IndirR.NumElements = IndirR.Width; + auto Indir = IndirR.createRdRegion(Rd->getOperand(0), + Rd->getName() + ".split_replicated_indir", Rd, Rd->getDebugLoc()); + // ... and a direct region that replicates the row. + R->Indirect = nullptr; + R->Offset = 0; + R->Stride = 1; + auto NewRd = R->createRdRegion(Indir, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Rd->replaceAllUsesWith(NewRd); + Rd->eraseFromParent(); + Modified = true; + return; + } + if (R->Width == 1 || R->Stride) + return; + // Replicating columns. We want an indirect region that just reads + // one column + Region IndirR = *R; + IndirR.NumElements = IndirR.NumElements / IndirR.Width; + IndirR.Width = 1; + auto Indir = IndirR.createRdRegion(Rd->getOperand(0), + Rd->getName() + ".split_replicated_indir", Rd, Rd->getDebugLoc()); + // ... and a direct region that replicates the column. + R->Indirect = nullptr; + R->Offset = 0; + R->VStride = 1; + auto NewRd = R->createRdRegion(Indir, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Rd->replaceAllUsesWith(NewRd); + Rd->eraseFromParent(); +} + +/*********************************************************************** + * processWrRegionElim : process a wrregion and eliminate redundant writes + * + * This detects the following code: + * + * B = wrregion(A, V1, R) + * C = wrregion(B, V2, R) + * + * (where "R" is a region that is identical in the two versions + * this can be collapsed to + * + * D = wrregion(A, V2, R) + * + */ +void GenXRegionCollapsing::processWrRegionElim(Instruction *OuterWr) +{ + auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return; + // Only perform this optimisation if the only use is with outer - otherwise + // this seems to make the code spill more + assert(InnerWr); + if (!InnerWr->hasOneUse()) + return; + Region InnerR(InnerWr, BaleInfo(), /*WantParentWidth=*/true); + Region OuterR(OuterWr, BaleInfo()); + if (OuterR != InnerR) + return; + // Create the combined wrregion. + Instruction *CombinedWr = cast(OuterR.createWrRegion( + InnerWr->getOperand(0), + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum), + OuterWr->getName() + ".regioncollapsed", OuterWr, + OuterWr->getDebugLoc())); + OuterWr->replaceAllUsesWith(CombinedWr); + // Do not erase OuterWr here -- it gets erased by the caller. + Modified = true; +} + +/*********************************************************************** + * processWrRegionBitCast : handle a wrregion whose "new value" is a + * bitcast (before processing wrregion for region collapsing) + * + * Enter: Inst = the wrregion + * + * Return: replacement wrregion if any, else 0 + * + * If the "new value" operand of the wrregion is a bitcast from scalar to + * 1-vector, or vice versa, then we can replace the wrregion with one that + * uses the input to the bitcast directly. 
This may enable later baling + * that would otherwise not happen. + * + * The bitcast typically arises from GenXLowering lowering an insertelement. + */ +Instruction *GenXRegionCollapsing::processWrRegionBitCast(Instruction *WrRegion) +{ + assert(GenXIntrinsic::isWrRegion(WrRegion)); + if (auto BC = dyn_cast(WrRegion->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum))) { + if (BC->getType()->getScalarType() + == BC->getOperand(0)->getType()->getScalarType()) { + // The bitcast is from scalar to 1-vector, or vice versa. + Region R(WrRegion, BaleInfo()); + auto NewInst = cast(R.createWrRegion(WrRegion->getOperand(0), + BC->getOperand(0), "", WrRegion, WrRegion->getDebugLoc())); + NewInst->takeName(WrRegion); + WrRegion->replaceAllUsesWith(NewInst); + WrRegion->eraseFromParent(); + return NewInst; + } + } + return nullptr; +} + +/*********************************************************************** + * processWrRegionBitCast2 : handle a wrregion whose "new value" is a + * bitcast (after processing wrregion for region collapsing) + * + * Enter: WrRegion = the wrregion + * + * This does not erase WrRegion even if it becomes unused. + * + * + * If the "new value" operand of the wrregion is some other bitcast, then we + * change the wrregion to the pre-bitcast type and add new bitcasts for the + * "old value" input and the result. This makes it possible for the new value + * to be baled in to the wrregion. + */ +void GenXRegionCollapsing::processWrRegionBitCast2(Instruction *WrRegion) +{ + auto BC = dyn_cast(WrRegion->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!BC) + return; + Type *BCInputElementType = BC->getOperand(0)->getType()->getScalarType(); + if (BCInputElementType->isIntegerTy(1)) + return; + // Get the region params for the replacement wrregion, checking if that + // fails. + Region R(WrRegion, BaleInfo()); + if (!R.changeElementType(BCInputElementType)) + return; + // Bitcast the "old value" input. + Value *OldVal = createBitCastToElementType( + WrRegion->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum), + BCInputElementType, WrRegion->getName() + ".precast", WrRegion, DL, + WrRegion->getDebugLoc()); + // Create the replacement wrregion. + auto NewInst = cast(R.createWrRegion(OldVal, BC->getOperand(0), + "", WrRegion, WrRegion->getDebugLoc())); + NewInst->takeName(WrRegion); + // Cast it. + Value *Res = createBitCast(NewInst, WrRegion->getType(), + WrRegion->getName() + ".postcast", WrRegion, WrRegion->getDebugLoc()); + WrRegion->replaceAllUsesWith(Res); +} + +static bool hasMemoryDeps(CallInst *L1, CallInst *L2, Value *Addr, + DominatorTree *DT) { + + auto isKill = [=](Instruction &I) { + Instruction *Inst = &I; + if (GenXIntrinsic::isVStore(Inst) && + (Addr == Inst->getOperand(1) || + Addr == getUnderlyingGlobalVariable(Inst->getOperand(1)))) + return true; + // OK. + return false; + }; + + // vloads from the same block. + if (L1->getParent() == L2->getParent()) { + BasicBlock::iterator I = L1->getParent()->begin(); + for (; &*I != L1 && &*I != L2; ++I) + /*empty*/; + assert(&*I == L1 || &*I == L2); + auto IEnd = (&*I == L1) ? L2->getIterator() : L1->getIterator(); + return std::any_of(I->getIterator(), IEnd, isKill); + } + + // vloads are from different blocks. 
+ // + // BB1 (L1) + // / \ + // BB3 BB2 (L2) + // \ / + // BB4 + // + auto BB1 = L1->getParent(); + auto BB2 = L2->getParent(); + if (!DT->properlyDominates(BB1, BB2)) { + std::swap(BB1, BB2); + std::swap(L1, L2); + } + if (DT->properlyDominates(BB1, BB2)) { + // As BB1 dominates BB2, we can recursively check BB2's predecessors, until + // reaching BB1. + // + // check BB1 && BB2 + if (std::any_of(BB2->begin(), L2->getIterator(), isKill)) + return true; + if (std::any_of(L1->getIterator(), BB1->end(), isKill)) + return true; + std::set Visited{BB1, BB2}; + std::vector BBs; + for (auto I = pred_begin(BB2), E = pred_end(BB2); I != E; ++I) { + BasicBlock *BB = *I; + if (!Visited.count(BB)) + BBs.push_back(BB); + } + + // This visits the subgraph dominated by BB1, originated from BB2. + while (!BBs.empty()) { + BasicBlock *BB = BBs.back(); + BBs.pop_back(); + Visited.insert(BB); + + // check if there is any store kill in this block. + if (std::any_of(BB->begin(), BB->end(), isKill)) + return true; + + // Populate not visited predecessors. + for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) + if (!Visited.count(*I)) + BBs.push_back(*I); + } + + // no mem deps. + return false; + } + + return true; +} + +// Check whether two values are bitwise identical. +static bool isBitwiseIdentical(Value *V1, Value *V2, DominatorTree *DT) { + assert(V1 && V2 && "null value"); + if (V1 == V2) + return true; + if (BitCastInst *BI = dyn_cast(V1)) + V1 = BI->getOperand(0); + if (BitCastInst *BI = dyn_cast(V2)) + V2 = BI->getOperand(0); + + // Special case arises from vload/vstore. + if (GenXIntrinsic::isVLoad(V1) && GenXIntrinsic::isVLoad(V2)) { + auto L1 = cast(V1); + auto L2 = cast(V2); + + // Loads from global variables. + auto GV1 = getUnderlyingGlobalVariable(L1->getOperand(0)); + auto GV2 = getUnderlyingGlobalVariable(L2->getOperand(0)); + Value *Addr = L1->getOperand(0); + if (GV1 && GV1 == GV2) + // OK. + Addr = GV1; + else if (L1->getOperand(0) != L2->getOperand(0)) + // Check if loading from the same location. + return false; + else if (!isa(Addr)) + // Check if this pointer is local and only used in vload/vstore. + return false; + + // Check if there is no store to the same location in between. + return !hasMemoryDeps(L1, L2, Addr, DT); + } + + // Cannot prove. + return false; +} + +/*********************************************************************** + * processWrRegion : process a wrregion + * + * Enter: OuterWr = the wrregion instruction that we will attempt to use as + * the outer wrregion and collapse with inner ones + * + * Return: the replacement wrregion if any, otherwise OuterWr + * + * This detects the following code: + * + * B = rdregion(A, OuterR) + * C = wrregion(B, V, InnerR) + * D = wrregion(A, C, OuterR) + * + * (where "InnerR" and "OuterR" are the region parameters). This code can + * be collapsed to + * + * D = wrregion(A, V, CombinedR) + * + * We want to do innermost wrregion combining first, but this pass visits + * instructions in the wrong order for that. So, when we see a wrregion + * here, we use recursion to scan back to find the innermost one and then work + * forwards to where we started. + */ +Instruction *GenXRegionCollapsing::processWrRegion(Instruction *OuterWr) +{ + assert(OuterWr); + // Find the inner wrregion, skipping bitcasts. 
+ auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + while (InnerWr && isa(InnerWr)) + InnerWr = dyn_cast(InnerWr->getOperand(0)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return OuterWr; + // Process inner wrregions first, recursively. + InnerWr = processWrRegion(InnerWr); + // Now process this one. + // Find the associated rdregion of the outer region, skipping bitcasts, + // and check it has the right region parameters. + assert(InnerWr); + auto OuterRd = dyn_cast(InnerWr->getOperand(0)); + while (OuterRd && isa(OuterRd)) + OuterRd = dyn_cast(OuterRd->getOperand(0)); + if (!GenXIntrinsic::isRdRegion(OuterRd)) + return OuterWr; + assert(OuterRd); + if (!isBitwiseIdentical(OuterRd->getOperand(0), OuterWr->getOperand(0), DT)) + return OuterWr; + Region InnerR = Region::getWithOffset(InnerWr, /*WantParentWidth=*/true); + Region OuterR = Region::getWithOffset(OuterWr); + if (OuterR != Region::getWithOffset(OuterRd)) + return OuterWr; + // See if the regions can be combined. + LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::processWrRegion:\n" + " OuterWr (line " << OuterWr->getDebugLoc().getLine() << "): " << *OuterWr << "\n" + " InnerWr (line " << InnerWr->getDebugLoc().getLine() << "): " << *InnerWr << "\n"); + if (!normalizeElementType(&OuterR, &InnerR)) { + LLVM_DEBUG(dbgs() << "Cannot normalize element type\n"); + return OuterWr; + } + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return OuterWr; // cannot combine + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum), + InnerWr->getName() + ".indexcollapsed", OuterWr, InnerWr->getDebugLoc()); + } + // Bitcast inputs if necessary. + Value *OldValInput = OuterRd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + OldValInput = createBitCastToElementType(OldValInput, InnerR.ElementTy, + OldValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + Value *NewValInput = InnerWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + NewValInput = createBitCastToElementType(NewValInput, InnerR.ElementTy, + NewValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + // Create the combined wrregion. + Instruction *CombinedWr = cast(CombinedR.createWrRegion( + OldValInput, + NewValInput, + InnerWr->getName() + ".regioncollapsed", OuterWr, + InnerWr->getDebugLoc())); + // Bitcast to the original type if necessary. + Value *Res = createBitCast(CombinedWr, OuterWr->getType(), + CombinedWr->getName() + ".cast", OuterWr, + InnerWr->getDebugLoc()); + // Replace all uses. + OuterWr->replaceAllUsesWith(Res); + // Do not erase OuterWr here, as (if this function recursed to process an + // inner wrregion first) OuterWr might be the same as Prev in the loop in + // runOnFunction(). For a recursive call of processWrRegion, it will + // eventually get visited and then erased as it has no uses. For an outer + // call of processWrRegion, OuterWr is erased by the caller. 
+ Modified = true; + return CombinedWr; +} + +/*********************************************************************** + * processWrRegionSplat : process a wrregion + * + * Enter: OuterWr = the wrregion instruction that we will attempt to use as + * the outer wrregion and collapse with inner ones + * + * Return: the replacement wrregion if any, otherwise OuterWr + * + * This detects the following code: + * + * C = wrregion(undef, V, InnerR) + * D = wrregion(undef, C, OuterR) + * + * (where "InnerR" and "OuterR" are the region parameters). This code can + * be collapsed to + * + * D = wrregion(undef, V, CombinedR) + * + * We want to do innermost wrregion combining first, but this pass visits + * instructions in the wrong order for that. So, when we see a wrregion + * here, we use recursion to scan back to find the innermost one and then work + * forwards to where we started. + */ +Instruction *GenXRegionCollapsing::processWrRegionSplat(Instruction *OuterWr) +{ + assert(OuterWr); + // Find the inner wrregion, skipping bitcasts. + auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + while (InnerWr && isa(InnerWr)) + InnerWr = dyn_cast(InnerWr->getOperand(0)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return OuterWr; + // Process inner wrregions first, recursively. + InnerWr = processWrRegionSplat(InnerWr); + + // Now process this one. + auto InnerSrc = dyn_cast(InnerWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!InnerSrc) + return OuterWr; + // Ensure that the combined region is well-defined. + if (InnerSrc->getType()->getScalarSizeInBits() != + OuterWr->getType()->getScalarSizeInBits()) + return OuterWr; + + auto OuterSrc = dyn_cast(OuterWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!OuterSrc) + return OuterWr; + if (isa(InnerSrc)) { + // OK. + } else { + auto InnerSplat = InnerSrc->getSplatValue(); + auto OuterSplat = OuterSrc->getSplatValue(); + if (!InnerSplat || !OuterSplat || InnerSplat != OuterSplat) + return OuterWr; + } + + Region InnerR = Region::getWithOffset(InnerWr, /*WantParentWidth=*/true); + Region OuterR = Region::getWithOffset(OuterWr); + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return OuterWr; // cannot combine + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum), + InnerWr->getName() + ".indexcollapsed", OuterWr, InnerWr->getDebugLoc()); + } + // Bitcast inputs if necessary. + Value *OldValInput = OuterSrc; + Value *NewValInput = InnerWr->getOperand(1); + NewValInput = createBitCastToElementType(NewValInput, OuterWr->getType()->getScalarType(), + NewValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + // Create the combined wrregion. + Instruction *CombinedWr = cast(CombinedR.createWrRegion( + OldValInput, + NewValInput, + InnerWr->getName() + ".regioncollapsed", OuterWr, + InnerWr->getDebugLoc())); + // Bitcast to the original type if necessary. + Value *Res = createBitCast(CombinedWr, OuterWr->getType(), + CombinedWr->getName() + ".cast", OuterWr, + InnerWr->getDebugLoc()); + // Replace all uses. + OuterWr->replaceAllUsesWith(Res); + // Do not erase OuterWr here, as (if this function recursed to process an + // inner wrregion first) OuterWr might be the same as Prev in the loop in + // runOnFunction(). 
For a recursive call of processWrRegionSplat, it will + // eventually get visited and then erased as it has no uses. For an outer + // call of processWrRegionSplat, OuterWr is erased by the caller. + Modified = true; + return CombinedWr; +} + +/*********************************************************************** + * normalizeElementType : where two regions have different element size, + * make them the same if possible + * + * Enter: R1 = first region + * R2 = second region + * PreferFirst = true to prefer the first region's element type + * + * Return: false if failed + * + * If PreferFirst is false, this uses the larger element size if everything is + * suitably aligned and the region with the smaller element size can be + * converted to the larger element size. + * + * Otherwise, it uses the smaller element size if the region with the + * larger element size can be converted to the smaller element size. + */ +bool GenXRegionCollapsing::normalizeElementType(Region *R1, Region *R2, + bool PreferFirst) +{ + if (R1->ElementBytes == R2->ElementBytes) + return true; // nothing to do + LLVM_DEBUG(dbgs() << "Before normalizeElementType:\n" + " R1: " << *R1 << "\n" + " R2: " << *R2 << "\n"); + // Set BigR to the region with the bigger element size, and SmallR to the + // region with the smaller element size. + bool PreferSmall = false; + Region *BigR = nullptr, *SmallR = nullptr; + if (R1->ElementBytes > R2->ElementBytes) { + BigR = R1; + SmallR = R2; + } else { + BigR = R2; + SmallR = R1; + PreferSmall = PreferFirst; + } + // Try the smaller element size first if it is preferred by the caller. + if (PreferSmall) + if (!BigR->Indirect) // big region not indirect + if (BigR->changeElementType(SmallR->ElementTy)) + return true; + // Then try the bigger element size. + if (!SmallR->Indirect) // small region not indirect + if (SmallR->changeElementType(BigR->ElementTy)) + return true; + // Then try the smaller element size. + if (!PreferSmall) + if (!BigR->Indirect) // big region not indirect + if (BigR->changeElementType(SmallR->ElementTy)) + return true; + return false; +} + +/*********************************************************************** + * combineRegions : combine two regions if possible + * + * Enter: OuterR = Region struct for outer region + * InnerR = Region struct for inner region + * CombinedR = Region struct to write combined region into + * + * Return: true if combining is possible + * + * If combining is possible, this function sets up CombinedR. However, + * CombinedR->Offset and CombinedR->Indirect are set assuming that the + * inner region is direct. + * + * If OuterR->ElementTy != InnerR->ElementTy, this algo cannot determine + * CombinedR->ElementTy, as the type depends on the order of respective + * wr/rd regions (it should be the type of the last one). 
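+ *
+ * As an illustrative summary of the composition performed below (assuming
+ * both regions use the same element size): the combined strides are the inner
+ * strides scaled by OuterR->Stride, and for a 2D outer region the combined
+ * byte offset is
+ *
+ *   OuterR->Offset + ((InnerEl / OuterR->Width) * OuterR->VStride
+ *                     + (InnerEl % OuterR->Width) * OuterR->Stride)
+ *                    * InnerR->ElementBytes
+ *
+ * where InnerEl = InnerR->Offset / InnerR->ElementBytes. The case analysis
+ * below then decides whether such a combined region is actually expressible.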
+ */ +bool GenXRegionCollapsing::combineRegions(const Region *OuterR, + const Region *InnerR, Region *CombinedR) +{ + LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::combineRegions\n" + " OuterR: " << *OuterR << "\n" + " InnerR: " << *InnerR << "\n"); + if (InnerR->Indirect && isa(InnerR->Indirect->getType())) + return false; // multi indirect not supported + if (OuterR->Indirect && isa(OuterR->Indirect->getType())) + return false; // multi indirect not supported + if (OuterR->Mask) + return false; // outer region predicated, cannot combine + *CombinedR = *InnerR; + CombinedR->Indirect = OuterR->Indirect; + CombinedR->Stride *= OuterR->Stride; + CombinedR->VStride *= OuterR->Stride; + unsigned ElOffset = InnerR->Offset / InnerR->ElementBytes; + if (OuterR->is2D()) { + // Outer region is 2D: create the combined offset. For outer 2D + // and inner indirect, what CombinedR->Offset is set to here is + // ignored and overwritten by calculateIndex(), so it does not matter + // that it is incorrect in that case. + ElOffset = ElOffset / OuterR->Width * OuterR->VStride + + ElOffset % OuterR->Width * OuterR->Stride; + } else { + // Outer region is 1D: create the combined offset. For the benefit + // of inner indirect, where InnerR->Offset is just an offset from + // InnerR->Indirect, we cope with InnerR->Offset being apparently + // out of range (negative or too big). + ElOffset *= OuterR->Stride; + } + CombinedR->Offset = OuterR->Offset + ElOffset * InnerR->ElementBytes; + if (!OuterR->is2D()) { + LLVM_DEBUG(dbgs() << "outer 1D: CombinedR: " << *CombinedR << "\n"); + return true; // outer region is 1D, can always combine + } + if (InnerR->isScalar()) { + LLVM_DEBUG(dbgs() << "inner scalar/splat: CombinedR: " << *CombinedR << "\n"); + return true; // inner region is scalar/splat, can always combine + } + if (InnerR->Indirect) { + // Indirect inner region. Can combine as long as inner vstride is a + // multiple of outer width, and it in turn is a multiple of inner parent + // width. + if (InnerR->ParentWidth && !(InnerR->VStride % (int)OuterR->Width) + && !(OuterR->Width % InnerR->ParentWidth)) { + CombinedR->VStride = InnerR->VStride / OuterR->Width * OuterR->VStride; + LLVM_DEBUG(dbgs() << "inner indirect: CombinedR: " << *CombinedR << "\n"); + return true; + } + LLVM_DEBUG(dbgs() << "inner indirect: failed\n"); + return false; + } + // Inner region is not indirect. + unsigned StartEl = InnerR->Offset / InnerR->ElementBytes; + unsigned StartRow = StartEl / OuterR->Width; + if (!InnerR->is2D()) { + // Inner region is 1D but outer region is 2D. + unsigned EndEl = StartEl + (InnerR->NumElements - 1) * InnerR->Stride; + unsigned EndRow = EndEl / OuterR->Width; + if (StartRow == EndRow) { + // The whole 1D inner region fits in a row of the outer region. + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, fits in row: CombinedR: " << *CombinedR << "\n"); + return true; + } + if (EndRow == StartRow + 1 && !(InnerR->NumElements % 2)) { + unsigned MidEl = StartEl + InnerR->NumElements / 2 * InnerR->Stride; + if (InnerR->Stride > 0 && (unsigned)(MidEl - (EndRow * OuterR->Width)) + < (unsigned)InnerR->Stride) { + // The 1D inner region is evenly split between two adjacent rows of + // the outer region. 
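+        // Worked example (hypothetical numbers): OuterR = <Width 4, VStride 8,
+        // Stride 1> and a direct 1D InnerR with NumElements 4, Stride 1,
+        // starting at outer element 2. Then StartRow = 0, EndRow = 1 and
+        // MidEl = 4, so the test above passes; the combined region gets
+        // Width = 2 and VStride = (0 - 2) * 1 + 8 = 6, i.e. it reads source
+        // elements {2,3} and {8,9} -- exactly what the nested pair read.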
+ CombinedR->VStride = (MidEl % OuterR->Width - StartEl % OuterR->Width) + * OuterR->Stride + OuterR->VStride; + CombinedR->Width = InnerR->NumElements / 2; + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, split between two rows: CombinedR: " << *CombinedR << "\n"); + return true; + } + } + unsigned BeyondEndEl = EndEl + InnerR->Stride; + if (BeyondEndEl % OuterR->Width == StartEl % OuterR->Width + && !(OuterR->Width % InnerR->Stride)) { + // The 1D inner region is evenly split between N adjacent rows of the + // outer region, starting in the same column for each row. + CombinedR->Width = OuterR->Width / InnerR->Stride; + CombinedR->VStride = OuterR->VStride; + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, split between N rows: CombinedR: " << *CombinedR << "\n"); + return true; + } + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, fail\n"); + return false; // All other 1D inner region cases fail. + } + if (!(InnerR->VStride % (int)OuterR->Width)) { + // Inner vstride is a whole number of outer rows. + CombinedR->VStride = OuterR->VStride * InnerR->VStride / (int)OuterR->Width; + if (!InnerR->Indirect) { + // For a direct inner region, calculate whether we can combine. + unsigned StartEl = InnerR->Offset / InnerR->ElementBytes; + unsigned StartRow = StartEl / OuterR->Width; + unsigned EndRowOfFirstRow = (StartEl + (InnerR->Width - 1) * InnerR->Stride) + / OuterR->Width; + if (StartRow == EndRowOfFirstRow) { + // Each row of inner region is within a row of outer region, starting + // at the same column. + LLVM_DEBUG(dbgs() << "row within row: CombinedR: " << *CombinedR << "\n"); + return true; + } + } else { + // For an indirect inner region, use parent width to tell whether we can + // combine. + if (InnerR->ParentWidth && !(OuterR->Width % InnerR->ParentWidth)) { + LLVM_DEBUG(dbgs() << "inner indirect, parentwidth ok: CombinedR: " << *CombinedR << "\n"); + return true; + } + } + } + // We could handle other cases like: + // - each row of inner region enclosed in a row of outer region + // but with a different column offset + LLVM_DEBUG(dbgs() << "failed\n"); + return false; +} + +/*********************************************************************** + * calculateIndex : calculate index in the case that the inner region is + * indirect + * + * Enter: OuterR, InnerR = outer and inner regions + * CombinedR = combined region set up by combineRegions() + * InnerIndex = variable index for inner region, including the + * constant offset add that was extracted by the Region + * constructor into InnerR->Offset + * Name = name for new instruction(s) + * InsertBefore = insert before this instruction + * DL = debug loc for new instruction(s) + * + * This sets up CombinedR->Indirect and CombinedR->Offset. + * + * A Region has the offset set up as follows: + * + * - For a direct region, R.Offset is the constant offset in bytes and + * R.Indirect is 0. + * + * - Normally, for an indirect region, R.Offset is 0 and R.Indirect is the + * Value used for the offset (in bytes). + * + * - But if the Value used for the offset is an add constant, then R.Offset + * is the constant offset and R.Indirect is the other operand of the add. + * + * In some code paths, this function needs the actual index of the inner region, + * rather than the R.Offset and R.Indirect parts separated out by the Region + * constructor. Thus it is passed InnerIndex, which is that actual index value. 
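+ *
+ * Worked example (hypothetical values): with a 2D outer region of Width 8,
+ * VStride 16, Stride 2 and 4-byte elements, an inner byte index of 40 (outer
+ * result element 10) splits into Col = 40 % 32 = 8 and Row = 40 / 32 = 1 and
+ * recombines as (Row * (16 * 4 / 2) + Col) * 2 = 80 bytes, which is the
+ * source byte offset of row 1, column 2 of the outer region.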
+ */ +void GenXRegionCollapsing::calculateIndex(const Region *OuterR, + const Region *InnerR, Region *CombinedR, Value *InnerIndex, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + if (!OuterR->is2D()) { + // Outer region is 1D. We can leave CombinedR->Offset as + // set by combineRegions, but we need to add the indices together, scaling + // the inner one by the outer region's stride. + Value *Idx = InnerR->Indirect; + if (OuterR->Stride != 1) { + Idx = insertOp(Instruction::Mul, Idx, OuterR->Stride, Name, + InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + if (OuterR->Indirect) { + Idx = insertOp(Instruction::Add, Idx, OuterR->Indirect, Name, + InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + CombinedR->Indirect = Idx; + LLVM_DEBUG(dbgs() << " calculateIndex result(1d): CombinedR: " << *CombinedR << "\n"); + return; + } + // Outer region is 2D. We need to split the inner region's index into row + // and column of the outer region, then recombine. We are using InnerIndex, + // which includes any constant offset add, so we need to adjust + // CombinedR->Offset so it does not include InnerR->Offset. + CombinedR->Offset = OuterR->Offset; + LLVM_DEBUG(dbgs() << " calculateIndex: Offset now " << CombinedR->Offset << "\n"); + Value *Col = insertOp(Instruction::URem, InnerIndex, + OuterR->Width * OuterR->ElementBytes, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Col << "\n"); + Value *Row = insertOp(Instruction::UDiv, InnerIndex, + OuterR->Width * OuterR->ElementBytes, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Row << "\n"); + Value *Idx = nullptr; + if (!(OuterR->VStride % OuterR->Stride)) { + // We need to multply Row by VStride and Col by Stride. However, Stride + // divides VStride evenly, so we can common up the multiply by Stride. + Idx = insertOp(Instruction::Mul, Row, + OuterR->VStride * OuterR->ElementBytes / OuterR->Stride, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Idx = insertOp(Instruction::Add, Idx, Col, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Idx = insertOp(Instruction::Mul, Idx, OuterR->Stride, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } else { + // Need to do Row*VStride and Col*Stride separately. 
+ Idx = insertOp(Instruction::Mul, Row, + OuterR->VStride * OuterR->ElementBytes, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Col = insertOp(Instruction::Mul, Col, OuterR->Stride, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Col << "\n"); + Idx = insertOp(Instruction::Add, Idx, Col, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + if (OuterR->Indirect) { + Idx = insertOp(Instruction::Add, Idx, OuterR->Indirect, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + CombinedR->Indirect = Idx; + LLVM_DEBUG(dbgs() << " calculateIndex result(2d): CombinedR: " << *CombinedR << "\n"); +} + +/*********************************************************************** + * insertOp : insert a binary op + */ +Value *GenXRegionCollapsing::insertOp(Instruction::BinaryOps Opcode, Value *Lhs, + unsigned Rhs, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + auto I16Ty = Type::getInt16Ty(InsertBefore->getContext()); + return insertOp(Opcode, Lhs, + Constant::getIntegerValue(I16Ty, APInt(16, Rhs)), + Name, InsertBefore, DL); +} + +Value *GenXRegionCollapsing::insertOp(Instruction::BinaryOps Opcode, Value *Lhs, + Value *Rhs, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + if (auto C = dyn_cast(Rhs)) { + int RhsVal = C->getZExtValue(); + int LogVal = genx::exactLog2(RhsVal); + if (LogVal >= 0) { + switch (Opcode) { + case Instruction::Mul: + // multiply by power of 2 -> shl + if (!LogVal) + return Lhs; + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, LogVal)); + Opcode = Instruction::Shl; + break; + case Instruction::UDiv: + // divide by power of 2 -> lshr + if (!LogVal) + return Lhs; + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, LogVal)); + Opcode = Instruction::LShr; + break; + case Instruction::URem: + // remainder by power of 2 -> and + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, RhsVal - 1)); + Opcode = Instruction::And; + break; + default: + break; + } + } + } + auto Inst = BinaryOperator::Create(Opcode, Lhs, Rhs, Name, InsertBefore); + Inst->setDebugLoc(DL); + return Inst; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp new file mode 100644 index 000000000000..247442968078 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp @@ -0,0 +1,146 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRematerialization +/// --------------------- +/// +/// This pass performs rematerialization to reduce register pressure. +/// +//===----------------------------------------------------------------------===// +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXPressureTracker.h" +#include "GenXUtil.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXRematerialization : public FunctionGroupPass { + GenXBaling *Baling = nullptr; + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + bool Modified = false; + +public: + static char ID; + explicit GenXRematerialization() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { + return "GenX rematerialization pass"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + +private: + void remat(Function *F, PressureTracker &RP); +}; + +} // namespace + +namespace llvm { void initializeGenXRematerializationPass(PassRegistry &); } +char GenXRematerialization::ID = 0; +INITIALIZE_PASS_BEGIN(GenXRematerialization, "GenXRematerialization", "GenXRematerialization", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXRematerialization, "GenXRematerialization", "GenXRematerialization", false, false) + +FunctionGroupPass *llvm::createGenXRematerializationPass() { + initializeGenXRematerializationPass(*PassRegistry::getPassRegistry()); + return new GenXRematerialization; +} + +void GenXRematerialization::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +bool GenXRematerialization::runOnFunctionGroup(FunctionGroup &FG) { + if (skipOptWithLargeBlock(FG)) + return false; + + Modified = false; + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + PressureTracker RP(FG, Liveness); + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) + remat(*fgi, RP); + return Modified; +} + +void GenXRematerialization::remat(Function *F, PressureTracker &RP) { + // Collect rematerialization candidates. 
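+  // An illustrative candidate (hypothetical types): a widening cast such as
+  //   %f = uitofp <16 x i16> %v to <16 x float>
+  // that stays inside one block but has several uses spanning a high-pressure
+  // range. Cloning the cast right before each such use means the wide
+  // <16 x float> value no longer needs to stay live across the whole range;
+  // only the narrower %v does.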
+ std::vector Candidates; + for (auto &BB : F->getBasicBlockList()) { + for (auto &Inst : BB.getInstList()) { + // (1) upward cast + if (auto CI = dyn_cast(&Inst)) { + if (CI->getOpcode() != Instruction::UIToFP && + CI->getOpcode() != Instruction::SIToFP) + continue; + if (!CI->getType()->isVectorTy()) + continue; + if (CI->getSrcTy()->getScalarSizeInBits() >= + CI->getDestTy()->getScalarSizeInBits()) + continue; + if (Inst.isUsedOutsideOfBlock(&BB) || Inst.getNumUses() <= 2) + continue; + LiveRange *LR = Liveness->getLiveRangeOrNull(CI); + if (!LR || LR->value_size() != 1) + continue; + assert(*LR->value_begin() == CI); + unsigned B = Numbering->getNumber(CI); + for (auto &U : CI->uses()) { + auto UI = U.getUser(); + unsigned E = Numbering->getNumber(UI); + if (E > B && RP.intersectWithRedRegion(B, E)) + Candidates.push_back(&U); + } + } + } + } + + // Do rematerialization. + for (auto U : Candidates) { + Instruction *Inst = cast(U->get()); + Instruction *UI = cast(U->getUser()); + Instruction *Clone = Inst->clone(); + Clone->insertBefore(UI); + U->set(Clone); + Modified = true; + } +} \ No newline at end of file diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp new file mode 100644 index 000000000000..6bd1ef7cf26b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp @@ -0,0 +1,3698 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXSimdCFConformance +/// --------------------- +/// +/// This pass checks that the use of SIMD control flow (llvm.genx.simdcf.goto +/// and llvm.genx.simdcf.join) conforms to the rules required to allow us to +/// generate actual goto and join instructions. If not, the intrinsics are +/// lowered to code that implements the defined semantics for the intrinsics, +/// but does not use SIMD CF instructions, so is usually less efficient. +/// +/// It also makes certain transformations to make goto/join legal in terms of its +/// position in the basic block. These can fail silently, in which case the +/// conformance check will fail on the goto/join in question: +/// +/// * A goto and its extractvalues must be at the end of the block. 
(Actually, if +/// the !any result of the goto is used in a conditional branch at the end of +/// the block, then the goto being baled into the branch means that it is +/// treated as being at the end of the block anyway. The only reason we need +/// to sink it here is to ensure that isGotoBlock works.) +/// +/// * For a join label block (a block that is the JIP of other gotos/joins), a +/// join must come at the start of the block. +/// +/// * For a branching join block (one whose conditional branch condition is the +/// !any result from a join), the join must be at the end of the block. +/// +/// * For a block that has one join with both of the above true, we need to move +/// all other code out of the block. +/// +/// The pass is run twice: an "early SIMD CF conformance pass" (a module pass) +/// just before GenXLowering, and a "late SIMD CF conformance pass" (a function +/// group pass) just before second baling. +/// +/// The early pass is the one that checks for conformance, and lowers the goto +/// and join intrinsics if the code is not conformant. The conformance checks +/// implement the rules listed in the documentation for the goto and join +/// intrinsics. +/// +/// Lowering a goto issues a "failed to optimize SIMD control flow" warning. No +/// clue is given in the warning as to what caused the conformance failure, +/// however you (a compiler developer) can find out (for a test case submitted +/// by a compiler user) by turning on -debug and looking at the output from this +/// pass. +/// +/// The late pass checks again for conformance, but if the code is not +/// conformant, it just errors. We could lower the gotos and joins there too, +/// but it would be more fiddly as we would have to ensure that the code +/// conforms with what is expected at that stage of compilation, and there is +/// no further chance to optimize it there. +/// +/// We are not expecting this error to happen. +/// +/// Otherwise, the late pass sets the register category of the EM and RM values +/// to "EM" and "RM", so they do not get any register allocated. +/// +/// Conformance rules +/// ^^^^^^^^^^^^^^^^^ +/// +/// If the goto and join intrinsics are not used in a way that conforms to the +/// rules, then they will still have the semantics in their spec, but this pass +/// will lower at least some of them to equivalent but less efficient code. +/// +/// The rules are: +/// +/// 1. Because the hardware has a single EM (execution mask) register, all EM +/// values input to and generated by these intrinsics must not interfere with +/// each other; that is, they must have disjoint live ranges. For the +/// purposes of determining interference, if any EM value is a phi node +/// with incoming constant all ones, then the constant all ones value is +/// counted as being live from the start of the function and is not allowed +/// to interfere with other EM values (although it can interfere with other +/// such constant all ones values). +/// +/// 2. An EM value is allowed to be defined: +/// +/// a. as part of the struct returned by one of these intrinsics; +/// +/// b. by a phi node, as long as each incoming is either an EM value or +/// a constant all ones; +/// +/// c. by an extractvalue extracting it from a struct containing an EM value; +/// +/// d. as a function argument, as long as an EM value is also returned by the +/// function (perhaps as part of a struct); +/// +/// e. by an insertvalue as part of a return value struct; +/// +/// f. 
as the return value of a non-intrinsic call (perhaps as part of a struct), +/// as long as there is also a call arg that is an EM value, and the called +/// function has the corresponding function arg and return value as EM values; +/// +/// g. since shufflevector from EM does not change EM and only makes it shorter +/// to create implicit predication of desired width, it's also considered +/// as an EM definition, but it can only be used by wrregion and select; +/// +/// 3. An EM value is allowed to be used: +/// +/// a. as the OldEM input to one of these intrinsics; +/// +/// b. in a phi node, as long as the result of the phi node is an EM value; +/// +/// c. as the condition in a wrregion or select; +/// +/// d. as the input to a shufflevector whose effect is to slice part of the EM +/// value starting at index 0, as long as the result of that slice is only +/// used as the condition in a wrregion or select; +/// +/// e. as a call argument, as long as the corresponding function argument is an +/// EM value, and the call has an EM return value; +/// +/// f. in a return (perhaps as part of a struct), as long as the function also +/// has an argument that is an EM value. +/// +/// For an EM value defined in a goto, or a join whose scalar BranchCond result +/// is used in a conditional branch, or in an extractvalue out of +/// the result of such a goto or join, the only use allowed in the same basic block +/// as the goto/join is such an extractvalue. +/// +/// 4. The OldEM input to the two intrinsics must be either an EM value or +/// constant all ones. In the latter case, and in the case of a constant incoming +/// to an EM phi node, its live range is considered to reach +/// back through all paths to the function entry for the purposes of rule (1). +/// +/// 5. Each join point has a web of RM (resume mask) values, linked as by rules (6) +/// and (7). All RM values within one join point's web must not interfere with +/// each other; that is, they must have disjoint live ranges. For the +/// purposes of determining interference, if an RM value is a phi node with +/// incoming constant all zeros, then the constant all zeros value is +/// counted as being live from the start of the function and is not allowed +/// to interfere with other RM values for this join (although it can +/// interfere with other such constant all zeros values). +/// +/// 6. An RM value is allowed to be defined: +/// +/// a. as part of the struct returned by ``llvm.genx.simdcf.goto``; +/// +/// b. by a phi node, as long as each incoming is either an RM value or +/// a constant all zeros. +/// +/// 7. An RM value is allowed to be used: +/// +/// a. as the OldRM input to ``llvm.genx.simdcf.goto``; +/// +/// b. as the RM input to ``llvm.genx.simdcf.join``, but only to one join in the +/// whole web; +/// +/// c. in a phi node, as long as the result of the phi node is an RM value. +/// +/// 8. The OldRM input to ``llvm.genx.simdcf.goto``, or the RM input to +/// ``llvm.genx.simdcf.join``, must be either an RM value, or constant all +/// zeros. In the latter case, and in the case of a constant incoming to an RM +/// phi node, its live range is considered to reach back through all paths +/// to the function entry or to the web's ``llvm.genx.simdcf.join`` for the +/// purposes of rule (5). +/// +/// 9. 
The BranchCond struct element of the result of ``llvm.genx.simdcf.goto`` +/// must either be unused (unextracted), or, after being extractvalued, +/// must have exactly one use, which is in a +/// conditional branch terminating the same basic block. In the unused case, +/// the basic block must end with an unconditional branch. (This is a goto +/// that is immediately followed by a join.) +/// +/// 10. The BranchCond struct element of the result of ``llvm.genx.simdcf.join`` +/// must either be unused (unextracted), or, after being extractvalued, +/// have exactly one use, which is in a conditional branch terminating the +/// same basic block. +/// +/// 11. It must be possible to derive an ordering for the basic blocks in a +/// function such that, in the conditional branch using the result of any goto +/// or join, the "false" successor is fall-through and the "true" successor is +/// to a join later on in the sequence. For a goto followed by an +/// unconditional branch, the successor is fall-through _and_ the next join +/// in sequence. +/// +/// **IR restriction**: goto and join intrinsics must conform to these rules +/// (since this pass lowers any that do not). +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_SIMDCFCONFORMANCE" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Local.h" + +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +static cl::opt EnableGenXGotoJoin("enable-genx-goto-join", cl::init(true), cl::Hidden, + cl::desc("Enable use of Gen goto/join instructions for SIMD control flow.")); + +namespace { + +// Diagnostic information for error/warning relating to SIMD control flow. +class DiagnosticInfoSimdCF : public DiagnosticInfoOptimizationBase { +private: + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + static void emit(Instruction *Inst, StringRef Msg, DiagnosticSeverity Severity = DS_Error); + DiagnosticInfoSimdCF(DiagnosticSeverity Severity, const Function &Fn, + const DebugLoc &DLoc, StringRef Msg) + : DiagnosticInfoOptimizationBase((DiagnosticKind)getKindID(), Severity, + /*PassName=*/nullptr, Msg, Fn, DLoc) {} + // This kind of message is always enabled, and not affected by -rpass. 
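+  // (These diagnostics are not gated by the -rpass remark machinery; the kind
+  // ID above is obtained from getNextAvailablePluginDiagnosticKind(), LLVM's
+  // hook for diagnostic kinds defined outside the core enum.)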
+ virtual bool isEnabled() const override { return true; } + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } + + // TODO: consider changing format + void print(DiagnosticPrinter &DP) const override { DP << "GenXSimdCFConformance: " << RemarkName; } +}; +int DiagnosticInfoSimdCF::KindID = 0; + +// GenX SIMD control flow conformance pass -- common data between early and +// late passes. +class GenXSimdCFConformance { +protected: + Module *M; + FunctionGroup *FG; + FunctionGroupAnalysis *FGA; + DominatorTreeGroupWrapperPass *DTWrapper; + std::map DTs; + GenXLiveness *Liveness; + bool Modified; + SetVector EMVals; + std::map> RMVals; + bool lowerSimdCF; +private: + + // GotoJoinEVs: container for goto/join Extract Value (EV) info. Also + // allowes to remove duplication of EVs. Performs it in construction + // and moves EVs right after goto/join. Hoisting can be performed + // again with hoistEVs method. For instance, it is used on join + // hoisting to save correct EM liveranges. + class GotoJoinEVs { + private: + enum ValPos { + EMPos = 0, + RMPos = 1, + JoinCondPos = 1, + GotoCondPos = 2, + PosNum + }; + + ExtractValueInst *EVs[PosNum] = { nullptr, nullptr, nullptr }; + bool IsGoto; + Value *GotoJoin; + + void CollectEVs(); + + public: + + GotoJoinEVs(Value *GJ = nullptr); + ExtractValueInst *getEMEV() const; + ExtractValueInst *getRMEV() const; + ExtractValueInst *getCondEV() const; + Value *getGotoJoin() const; + Instruction *getSplitPoint() const; + void setCondEV(ExtractValueInst *CondEV); + bool isGoto() const; + bool isJoin() const; + void hoistEVs() const; + + }; + + SetVector EMValsStack; + std::map GotoJoinMap; + std::map EMProducers; + std::map GotoJoinEVsMap; +protected: + GenXSimdCFConformance() : + M(0), FG(0), FGA(0), DTWrapper(0), Liveness(0), lowerSimdCF(false) {} + void gatherEMVals(); + void gatherRMVals(); + void removeFromEMRMVals(Value *V); + void moveCodeInGotoBlocks(bool hoistGotoUsers = false); + void moveCodeInJoinBlocks(); + void ensureConformance(); + void lowerAllSimdCF(); + void canonicalizeEM(); + void splitGotoJoinBlocks(); + void lowerUnsuitableGetEMs(); + void clear() { + DTs.clear(); + EMVals.clear(); + RMVals.clear(); + GotoJoinMap.clear(); + GotoJoinEVsMap.clear(); + EMProducers.clear(); + } +private: + bool isLatePass() { return FG != nullptr; } + void emptyBranchingJoinBlocksInFunc(Function *F); + void emptyBranchingJoinBlock(CallInst *Join); + DominatorTree *getDomTree(Function *F); + bool hoistJoin(CallInst *Join); + bool checkEMVal(SimpleValue EMVal); + bool checkGoto(SimpleValue EMVal); + bool checkJoin(SimpleValue EMVal); + bool checkGotoJoin(SimpleValue EMVal); + void removeBadEMVal(SimpleValue EMVal); + void pushValues(Value *V); + bool getConnectedVals(SimpleValue Val, int Cat, bool IncludeOptional, CallInst *OkJoin, SmallVectorImpl *ConnectedVals, bool LowerBadUsers = false); + void checkEMInterference(); + void checkInterference(SetVector *Vals, SetVector *BadDefs, Instruction *ConstStop); + bool hoistGotoUser(Instruction *Inst, CallInst *Goto, unsigned operandNo); + void gatherGotoJoinEMVals(bool IncludeIncoming = true); + void handleEVs(); + void resolveBitCastChains(); + Value *eliminateBitCastPreds(Value *Val, std::set &DeadInst, std::set &Visited); + Value *getEMProducer(Value *Inst, std::set &Visited, bool BitCastAllowed = false); + void handleCondValue(Value *GotoJoin); + void handleNoCondEVCase(GotoJoinEVs &GotoJoinData); + void handleOptimizedBranchCase(GotoJoinEVs &GotoJoinData, BasicBlock 
*&TrueSucc, BasicBlock *&FalseSucc); + void handleExistingBranchCase(GotoJoinEVs &GotoJoinData, BasicBlock *&TrueSucc, BasicBlock *&FalseSucc, BranchInst *ExistingBranch); + void addNewPhisIncomings(BasicBlock *BranchingBlock, BasicBlock *TrueSucc, BasicBlock *FalseSucc); + void collectCondEVUsers(ExtractValueInst *CondEV, std::vector &BadUsers, BranchInst *&CorrectUser); + void updateBadCondEVUsers(GotoJoinEVs &GotoJoinData, std::vector &BadUsers, BasicBlock *TrueSucc, BasicBlock *FalseSucc); + Value *findGotoJoinVal(int Cat, BasicBlock *Loc, Instruction *CondEV, BasicBlockEdge &TrueEdge, BasicBlockEdge &FalseEdge, Value *TrueVal, + Value *FalseVal, std::map &foundVals); + bool canUseLoweredEM(Instruction *Val); + void replaceUseWithLoweredEM(Instruction *Val, unsigned opNo, SetVector &ToRemove); + Value *insertCond(Value *OldVal, Value *NewVal, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); + Value *truncateCond(Value *In, Type *Ty, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); + void lowerGoto(CallInst *Goto); + void lowerJoin(CallInst *Join); + void replaceGotoJoinUses(CallInst *GotoJoin, ArrayRef Vals); +}; + +// GenX early SIMD control flow conformance pass +class GenXEarlySimdCFConformance + : public GenXSimdCFConformance, public ModulePass { +public: + static char ID; + explicit GenXEarlySimdCFConformance() : ModulePass(ID) { } + virtual StringRef getPassName() const { return "GenX early SIMD control flow conformance"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + } + bool runOnModule(Module &M); +}; + +// GenX late SIMD control flow conformance pass +class GenXLateSimdCFConformance + : public GenXSimdCFConformance, public FunctionGroupPass { +public: + static char ID; + explicit GenXLateSimdCFConformance() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX late SIMD control flow conformance"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + void setCategories(); + void modifyEMUses(Value *EM); +}; + +} // end anonymous namespace + +char GenXEarlySimdCFConformance::ID = 0; +namespace llvm { void initializeGenXEarlySimdCFConformancePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXEarlySimdCFConformance, "GenXEarlySimdCFConformance", "GenXEarlySimdCFConformance", false, false) +INITIALIZE_PASS_END(GenXEarlySimdCFConformance, "GenXEarlySimdCFConformance", "GenXEarlySimdCFConformance", false, false) + +ModulePass *llvm::createGenXEarlySimdCFConformancePass() +{ + initializeGenXEarlySimdCFConformancePass(*PassRegistry::getPassRegistry()); + return new GenXEarlySimdCFConformance(); +} + +char GenXLateSimdCFConformance::ID = 0; +namespace llvm { void initializeGenXLateSimdCFConformancePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXLateSimdCFConformance, "GenXLateSimdCFConformance", "GenXLateSimdCFConformance", false, false) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXModule) 
+INITIALIZE_PASS_END(GenXLateSimdCFConformance, "GenXLateSimdCFConformance", "GenXLateSimdCFConformance", false, false)
+
+FunctionGroupPass *llvm::createGenXLateSimdCFConformancePass()
+{
+  initializeGenXLateSimdCFConformancePass(*PassRegistry::getPassRegistry());
+  return new GenXLateSimdCFConformance();
+}
+
+/***********************************************************************
+ * runOnModule : run the early SIMD control flow conformance pass for this
+ * module
+ */
+bool GenXEarlySimdCFConformance::runOnModule(Module &ArgM)
+{
+  LLVM_DEBUG(dbgs() << "Early SIMD CF Conformance starts\n");
+
+  Modified = false;
+  M = &ArgM;
+  FG = nullptr;
+  FGA = nullptr;
+  DTWrapper = nullptr;
+  // Perform actions to create correct DF for EM
+  canonicalizeEM();
+  // Gather the EM values, both from goto/join and phi nodes.
+  gatherEMVals();
+  // Gather the RM values from gotos and phi nodes.
+  gatherRMVals();
+  // Hoist instructions that do not depend on Goto's result.
+  // This is needed to perform a correct split.
+  moveCodeInGotoBlocks();
+  // Split Goto/Join blocks to recreate actual SIMD CF
+  splitGotoJoinBlocks();
+  // Handle instructions that depend on Goto's result
+  moveCodeInGotoBlocks(true);
+  // Handle Joins to create correct SIMD CF structure
+  moveCodeInJoinBlocks();
+  // TODO: currently all SIMD CF is lowered if there is
+  // an unmask construct in the module. It is very suboptimal.
+  if (lowerSimdCF)
+    lowerAllSimdCF();
+  else
+    // Repeatedly check the code for conformance and lower non-conformant gotos
+    // and joins until the code stabilizes.
+    ensureConformance();
+  // Perform check for genx_simdcf_get_em intrinsics and remove redundant ones.
+  lowerUnsuitableGetEMs();
+  clear();
+
+  LLVM_DEBUG(dbgs() << "Early SIMD CF Conformance ends\n");
+
+  return Modified;
+}
+
+/***********************************************************************
+ * runOnFunctionGroup : run the late SIMD control flow conformance pass for this
+ * FunctionGroup
+ */
+bool GenXLateSimdCFConformance::runOnFunctionGroup(FunctionGroup &ArgFG)
+{
+  LLVM_DEBUG(dbgs() << "Late SIMD CF Conformance starts\n");
+
+  Modified = false;
+  FG = &ArgFG;
+  M = FG->getModule();
+  // Get analyses that we use and/or modify.
+  FGA = &getAnalysis<FunctionGroupAnalysis>();
+  DTWrapper = &getAnalysis<DominatorTreeGroupWrapperPass>();
+  Liveness = &getAnalysis<GenXLiveness>();
+  // Gather the EM values, both from goto/join and phi nodes.
+  gatherEMVals();
+  // Gather the RM values from gotos and phi nodes.
+  gatherRMVals();
+  // Move code in goto and join blocks as necessary.
+  moveCodeInGotoBlocks();
+  moveCodeInJoinBlocks();
+  // Check the code for conformance. In this late pass, we do not expect to
+  // find non-conformance.
+  ensureConformance();
+  // For remaining unlowered gotos and joins (the ones that will become SIMD
+  // control flow instructions), mark the webs of EM and RM values as
+  // category EM or RM respectively. For EM, this also modifies uses as needed.
+  setCategories();
+  clear();
+
+  LLVM_DEBUG(dbgs() << "Late SIMD CF Conformance ends\n");
+
+  return Modified;
+}
+
+/***********************************************************************
+ * gatherGotoJoinEMVals : gather the EM values for gotos/joins only
+ *
+ * IncludeIncoming is used for adding goto/join def to EMVals
+ */
+void GenXSimdCFConformance::gatherGotoJoinEMVals(bool IncludeIncoming)
+{
+  // We find gotos and joins by scanning all uses of the intrinsics and (in the
+  // case of the late pass) ignoring ones not in this function group, rather
+  // than scanning the whole IR.
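+  // The goto/join intrinsics are overloaded: the EM operand is fixed at
+  // <32 x i1> while the condition/RM operand exists for each power-of-two
+  // width from 1 to 32, so there is one declaration per width. The loop below
+  // therefore walks the use list of each declaration in turn.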
+ Type *I1Ty = Type::getInt1Ty(M->getContext()); + for (auto IID : { GenXIntrinsic::genx_simdcf_goto, GenXIntrinsic::genx_simdcf_join }) { + Type *EMTy = VectorType::get(I1Ty, 32); + for (unsigned Width = 1; Width <= 32; Width <<= 1) { + Type *Tys[] = { EMTy, VectorType::get(I1Ty, Width) }; + auto GotoJoinFunc = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + for (auto ui = GotoJoinFunc->use_begin(), ue = GotoJoinFunc->use_end(); + ui != ue; ++ui) { + auto GotoJoin = dyn_cast(ui->getUser()); + if (!GotoJoin) + continue; + if (FG && (FGA->getGroup(GotoJoin->getParent()->getParent()) != FG + || ui->getOperandNo() != GotoJoin->getNumArgOperands())) + continue; + // We have a goto/join (in our function group in the case of the late + // pass). Add the EM value (struct index 0) to EMVals. + EMVals.insert(SimpleValue(GotoJoin, 0)); + // Also add its EM input to EMVals, if not a constant. + if (IncludeIncoming && !isa(GotoJoin->getOperand(0))) + EMVals.insert(SimpleValue(GotoJoin->getOperand(0), 0)); + } + } + } +} + +/*********************************************************************** + * gatherEMVals : gather the EM values, including phi nodes + */ +void GenXSimdCFConformance::gatherEMVals() +{ + // Collect gotos/joins and their defs + gatherGotoJoinEMVals(true); + + Type *I1Ty = Type::getInt1Ty(M->getContext()); + Type *EMTy = VectorType::get(I1Ty, 32); + Type *Tys[] = { EMTy }; + auto SavemaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_savemask, Tys); + for (auto ui = SavemaskFunc->use_begin(), ue = SavemaskFunc->use_end(); ui != ue; + ++ui) { + auto Savemask = dyn_cast(ui->getUser()); + if (!Savemask) + continue; + if (FG && (FGA->getGroup(Savemask->getParent()->getParent()) != FG || + ui->getOperandNo() != Savemask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // Add its EM input to EMVals, if not a constant. + if (!isa(Savemask->getOperand(0))) + EMVals.insert(SimpleValue(Savemask->getOperand(0), 0)); + } + + auto UnmaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_unmask, Tys); + for (auto ui = UnmaskFunc->use_begin(), ue = UnmaskFunc->use_end(); ui != ue; + ++ui) { + auto Unmask = dyn_cast(ui->getUser()); + if (!Unmask) + continue; + if (FG && (FGA->getGroup(Unmask->getParent()->getParent()) != FG || + ui->getOperandNo() != Unmask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // We have a unmask (in our function group in the case of the late + EMVals.insert(SimpleValue(Unmask)); + } + auto RemaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_remask, Tys); + for (auto ui = RemaskFunc->use_begin(), ue = RemaskFunc->use_end(); ui != ue; + ++ui) { + auto Remask = dyn_cast(ui->getUser()); + if (!Remask) + continue; + if (FG && (FGA->getGroup(Remask->getParent()->getParent()) != FG || + ui->getOperandNo() != Remask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // We have a remask (in our function group in the case of the late + // pass). Add the EM value (struct index 0) to EMVals. + EMVals.insert(SimpleValue(Remask)); + // Also add its EM input to EMVals, if not a constant. 
+ if (!isa(Remask->getOperand(0))) + EMVals.insert(SimpleValue(Remask->getOperand(0))); + } + // delete useless cm_unmask_begin and cm_unmask_end + auto UnmaskEF = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_unmask_end); + for (auto ui = UnmaskEF->use_begin(), ue = UnmaskEF->use_end(); ui != ue;) { + auto u = ui->getUser(); + ++ui; + if (auto UnmaskEnd = dyn_cast(u)) + UnmaskEnd->eraseFromParent(); + } + auto UnmaskBF = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_unmask_begin); + for (auto ui = UnmaskBF->use_begin(), ue = UnmaskBF->use_end(); ui != ue;) { + auto u = ui->getUser(); + ++ui; + if (auto UnmaskBeg = dyn_cast(u)) + UnmaskBeg->eraseFromParent(); + } + // Find related phi nodes and values related by insertvalue/extractvalue/call + // using EMVal as a worklist. + for (unsigned i = 0; i != EMVals.size(); ++i) { + SimpleValue EMVal = EMVals[i]; + // For this EM value, get the connected values. + SmallVector ConnectedVals; + getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals); + // Add the connected values to EMVals. + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) + if (!isa(j->getValue())) + EMVals.insert(*j); + } +} + +/*********************************************************************** + * gatherRMVals : gather RM values for each join + */ +void GenXSimdCFConformance::gatherRMVals() +{ + for (auto ji = EMVals.begin(), je = EMVals.end(); ji != je; ++ji) { + auto EMVal = *ji; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_join) + continue; + auto Join = cast(EMVal.getValue()); + // We have a join. Gather its web of RM values. + auto RMValsEntry = &RMVals[Join]; + if (!isa(Join->getOperand(1))) + RMValsEntry->insert(Join->getOperand(1)); + for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + // RM is a value in this join's RM web. Get other values related by phi + // nodes and extractvalues and gotos. + SmallVector ConnectedVals; + getConnectedVals(RM, RegCategory::RM, /*IncludeOptional=*/true, + Join, &ConnectedVals); + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) + if (!isa(j->getValue())) + RMValsEntry->insert(*j); + } + } +} + +/*********************************************************************** + * findGotoJoinVal : find goto/join that should be applied at the + * specified location + * + * It uses dominator tree to find the value needed. Category is used to + * set proper name for instruction and doesn't affect reg category + * that is used in reg alloc. It only shows what we are dealing with. 
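+ *
+ * In outline: if Loc is dominated by the true edge the true-path value is
+ * returned, if it is dominated by the false edge the false-path value is
+ * returned; otherwise a phi is created at Loc's immediate dominator (or at
+ * Loc itself when the immediate dominator is the block defining the EV) and
+ * each predecessor is resolved recursively, with foundVals memoizing blocks
+ * already handled so that cycles terminate.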
+ */ +Value *GenXSimdCFConformance::findGotoJoinVal(int Cat, BasicBlock *Loc, Instruction *GotoJoinEV, + BasicBlockEdge &TrueEdge, BasicBlockEdge &FalseEdge, Value *TrueVal, Value *FalseVal, std::map& foundVals) +{ + assert(TrueEdge.getStart() == FalseEdge.getStart()); + assert(TrueEdge.getEnd() != FalseEdge.getEnd()); + assert((Cat == RegCategory::EM || Cat == RegCategory::PREDICATE) && "Handling only EM and Cond!"); + + LLVM_DEBUG(dbgs() << "Entering " << Loc->getName() << "\n"); + + // Check if value were found before + auto ResIt = foundVals.find(Loc); + if (ResIt != foundVals.end()) + return ResIt->second; + + DominatorTree *DomTree = getDomTree(Loc->getParent()); + if (DomTree->dominates(TrueEdge, Loc)) { + LLVM_DEBUG(dbgs() << "Dominated by True Edge\n"); + foundVals[Loc] = TrueVal;; + return TrueVal; + } + if (DomTree->dominates(FalseEdge, Loc)) { + LLVM_DEBUG(dbgs() << "Dominated by False Edge\n"); + foundVals[Loc] = FalseVal; + return FalseVal; + } + + // Need to create phi somewhere. + // Try to get IDom. If we found CondEV's BB then we are + // already in the final block + auto Node = DomTree->getNode(Loc); + auto IDom = Node->getIDom(); + assert(IDom && "No IDom found!"); + BasicBlock *PhiLoc = nullptr; + PhiLoc = IDom->getBlock(); + if (IDom->getBlock() == GotoJoinEV->getParent()) + PhiLoc = Loc; + + std::string Name = (Cat == RegCategory::EM) ? "ExecMaskEV" : "CondEV"; + auto PHI = PHINode::Create(GotoJoinEV->getType(), pred_size(PhiLoc), Name, &PhiLoc->front()); + foundVals[PhiLoc] = PHI; + if (PhiLoc != Loc) + foundVals[Loc] = PHI; + + for (auto pi = pred_begin(PhiLoc), pe = pred_end(PhiLoc); pi != pe; ++pi) { + BasicBlock *Pred = *pi; + Value *Val = nullptr; + + // Don't check dominators for def since we are looking for + // edges that are located after it + if (Pred == TrueEdge.getStart()) { + // This happens when we enter def block from join block + // w/o any intermediate blocks (actually we expect this + // situation to happen always). Check that we came through + // true branch. + if (Pred->getTerminator()->getSuccessor(0) == PhiLoc) { + Val = TrueVal; + LLVM_DEBUG(dbgs() << "Usual case\n"); + } else { + // This situation shouldn't happen, but if so, we can handle it + Val = FalseVal; + LLVM_DEBUG(dbgs() << "Strange case\n"); + } + } else { + Val = findGotoJoinVal(Cat, Pred, GotoJoinEV, TrueEdge, FalseEdge, TrueVal, FalseVal, foundVals); + } + + PHI->addIncoming(Val, Pred); + } + + LLVM_DEBUG(dbgs() << "Built PHI for EV:" << *PHI << "\n"); + return PHI; +} + +/** + * collectCondEVUsers : gather Cond EV users + * + * Bad users: they should not use cond EV. + * Correct user: conditional branch CondEV's BB. This + * is the only possible conformant user. + */ +void GenXSimdCFConformance::collectCondEVUsers(ExtractValueInst *CondEV, std::vector &BadUsers, BranchInst *&CorrectUser) +{ + // Bad users: they should not use cond EV. 
Make a real value for them + // Correct user: conditional branch in this BB + for (auto ui = CondEV->use_begin(), ue = CondEV->use_end(); ui != ue; ++ui) { + BranchInst *Br = dyn_cast(ui->getUser()); + + // If cond EV is used by wrong branch, we can simply consider + // it as non-baled conditional branch + if (!Br || Br->getParent() != CondEV->getParent()) { + LLVM_DEBUG(dbgs() << "Found bad CondEV user:\n" << *ui->getUser() << "\n"); + BadUsers.push_back(ui->getUser()); + } else if (Br) { + assert(!CorrectUser && "Found another correct user!"); + LLVM_DEBUG(dbgs() << "Found correct user:\n" << *Br << "\n"); + CorrectUser = Br; + } + } +} + +/** + * updateBadCondEVUsers : update bad cond EV users + * + * It replaces cond EV uses by values that can be + * obtained on true and false pathes + */ +void GenXSimdCFConformance::updateBadCondEVUsers(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, + std::vector &BadUsers, BasicBlock *TrueSucc, BasicBlock *FalseSucc) +{ + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + + BasicBlockEdge TrueEdge(CondEV->getParent(), TrueSucc); + BasicBlockEdge FalseEdge(CondEV->getParent(), FalseSucc); + Constant *TrueVal = Constant::getAllOnesValue(CondEV->getType()); + Constant *FalseVal = Constant::getNullValue(CondEV->getType()); + + // Update users + std::map FoundCondEV; + for (auto bi = BadUsers.begin(), be = BadUsers.end(); bi != be; ++bi) { + Instruction *User = cast(*bi); + for (unsigned idx = 0, opNum = User->getNumOperands(); idx < opNum; ++idx) { + if (CondEV != User->getOperand(idx)) + continue; + + User->setOperand(idx, findGotoJoinVal(RegCategory::PREDICATE, User->getParent(), CondEV, TrueEdge, FalseEdge, TrueVal, FalseVal, FoundCondEV)); + } + } +} + +/** + * addNewPhisIncomings : add new incomings after split + * + * It is needed to update phis after turning unconditional + * branch into conditional one. True successor is assumed to + * be correct join point, but the only thing we know here + * is that FalseSucc branches to TrueSucc. Branching Block's + * successors are TrueSucc and FalseSucc. + */ +void GenXSimdCFConformance::addNewPhisIncomings(BasicBlock *BranchingBlock, BasicBlock *TrueSucc, BasicBlock *FalseSucc) +{ + for (auto Inst = &TrueSucc->front(); + auto PN = dyn_cast(Inst); + Inst = Inst->getNextNode()) { + Value* CurrVal = PN->getIncomingValueForBlock(BranchingBlock); + PN->addIncoming(CurrVal, FalseSucc); + } +} + +/** + * handleNoCondEVCase : handle case when there is no + * CondEV for goto/join. + * + * It performs split for goto in order to prepare + * goto for possible EM lower. Goto is branch itself + * so such transformation doesn't introduce any + * overhead in case of conformant SIMD CF. + * + * TODO: this transformation can be reverted in case of + * non-conformant SIMD CF if necessary data was saved. + * It is not done now because no non-conformant cases + * were found so far. 
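+ *
+ * For example, a goto whose !any result was never extracted,
+ *
+ *   %gt = call ... @llvm.genx.simdcf.goto(...)   ; names/types illustrative
+ *   br label %next
+ *
+ * becomes
+ *
+ *   %gt = call ... @llvm.genx.simdcf.goto(...)
+ *   %missing_extractcond = extractvalue ... %gt, 2
+ *   br i1 %missing_extractcond, label %next, label %goto_split
+ * goto_split:
+ *   br label %next
+ *
+ * so every goto ends up with an explicit branch on its !any result, which
+ * later handling (e.g. hoistGotoUser) relies on.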
+ */ +void GenXSimdCFConformance::handleNoCondEVCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData) +{ + assert(!GotoJoinData.getCondEV() && "Unexpected CondEV!"); + + // Handle only goto + if (GotoJoinData.isJoin()) + return; + auto SplitPoint = GotoJoinData.getSplitPoint(); + + // Skip possible goto users + for (;; SplitPoint = SplitPoint->getNextNode()) { + if (SplitPoint->isTerminator()) + break; + if (auto CI = dyn_cast(SplitPoint)) { + // We need to perform split before next goto/join to save their conformance + if (GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_join) + break; + } + } + + Value *GotoJoin = GotoJoinData.getGotoJoin(); + ExtractValueInst *CondEV = ExtractValueInst::Create(GotoJoin, { 2 }, "missing_extractcond", SplitPoint); + GotoJoinData.setCondEV(CondEV); + + if (auto Br = dyn_cast(SplitPoint)) { + if (Br->isConditional()) { + // This CF is non-conformant: there should be a join point + // before this branch, but it wasn't found. Skip it. + return; + } + + // We are turning unconditional branch into conditional one + BasicBlock *Split = BasicBlock::Create(CondEV->getContext(), "goto_split", CondEV->getParent()->getParent(), Br->getSuccessor(0)); + BranchInst::Create(Br->getSuccessor(0), Split); + BranchInst::Create(Br->getSuccessor(0), Split, CondEV, Br); + + // Update phis in TrueSucc + addNewPhisIncomings(CondEV->getParent(), Br->getSuccessor(0), Split); + + Br->eraseFromParent(); + } else { + // Split point is in the middle of BB. We assume that there is a join point + // after it. + // TODO: consider adding this check. No such cases were found now. + BasicBlock *TrueSucc = CondEV->getParent()->splitBasicBlock(SplitPoint, "cond_ev_true_split"); + CondEV->getParent()->getTerminator()->eraseFromParent(); + LLVM_DEBUG(dbgs() << "Created " << TrueSucc->getName() << " to handle missing conditional branch\n"); + + // False block: need to create new one + BasicBlock *FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_false_split", CondEV->getParent()->getParent(), + TrueSucc); + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle missing conditional branch\n"); + + // Link blocks + BranchInst::Create(TrueSucc, FalseSucc, CondEV, CondEV->getParent()); + BranchInst::Create(TrueSucc, FalseSucc); + } + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); +} + +/** + * handleOptimizedBranchCase : perform split for optimized branch case + * + * TODO: this make sence only in case when the true successor is a + * join block, otherwise it will introduce more overhead due to + * goto/join lowering. Also there should be check that this + * join really uses current EM and RM. This issue is resolved + * at the end of this pass in EM/RM liveness analysis and cannot + * be done easy at this point. For now assume that everything OK + * with it here. + * + * TODO: It is possible to undo this transformation if we store + * all necessery data here. Currently it is not done: + * no non-conformant cases found for now. + * + * Due to earlier transformations we can split BB after the last + * goto/join EV. It will solve issue with join located in this + * basic block. Code movements to sink goto/join will be performed + * further, we don't need to focus on it here. 
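+ *
+ * Concretely: the block is split right after the last goto/join EV into a
+ * "cond_ev_true_split" successor, a fresh "cond_ev_false_split" block that
+ * simply branches to that successor is created, and the original block is
+ * re-terminated with a conditional branch on CondEV to the two new blocks.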
+ */ +void GenXSimdCFConformance::handleOptimizedBranchCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, BasicBlock *&TrueSucc, BasicBlock *&FalseSucc) +{ + // Look for the first non-goto/join user inst + auto SplitPoint = GotoJoinData.getSplitPoint(); + + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + + // Split: this is true succ which is join point (at least we assume that) + TrueSucc = CondEV->getParent()->splitBasicBlock(SplitPoint, "cond_ev_true_split"); + LLVM_DEBUG(dbgs() << "Created " << TrueSucc->getName() << " to handle missing conditional branch\n"); + CondEV->getParent()->getTerminator()->eraseFromParent(); + // False block: need to create new one + FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_false_split", CondEV->getParent()->getParent(), + TrueSucc); + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle missing conditional branch\n"); + // Link blocks + BranchInst::Create(TrueSucc, FalseSucc, CondEV, CondEV->getParent()); + BranchInst::Create(TrueSucc, FalseSucc); + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); +} + +/** + * handleExistingBranchCase : perform actions needed to + * handle case when branch wasn't optimized + * + * It stores True/False successors and adds new BB + * in case when both successors are the same BB. + */ +void GenXSimdCFConformance::handleExistingBranchCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, + BasicBlock *&TrueSucc, BasicBlock *&FalseSucc, BranchInst *ExistingBranch) +{ + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + assert(ExistingBranch->isConditional() && "Expected conditional branch!"); + + TrueSucc = ExistingBranch->getSuccessor(0); + FalseSucc = ExistingBranch->getSuccessor(1); + + if (TrueSucc == FalseSucc) { + // We need to simply introduce new BB to get CondEV + FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_split", CondEV->getParent()->getParent(), + TrueSucc); + BranchInst::Create(TrueSucc, FalseSucc); + ExistingBranch->setSuccessor(1, FalseSucc); + + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle always taken CONDITIONAL branch\n"); + + // Update phis in TrueSucc + addNewPhisIncomings(CondEV->getParent(), TrueSucc, FalseSucc); + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); + } +} + +/** + * handleCondValue : perform analysis on Cond EV usage and fix + * it if needed + * + * The basic use case is optimized False Successor. That + * often happens in standard SimplifyCFG pass. + */ +void GenXSimdCFConformance::handleCondValue(Value *GotoJoin) +{ + GotoJoinEVs &GotoJoinData = GotoJoinEVsMap[GotoJoin]; + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + + // No cond EV - nothing to handle. Here we create branch for goto + // to make it easier to handle possible bad EM users. 
Goto is a + // branch itself and it won't introduce any overhead in case + // of conformant SIMD CF + if (!CondEV) { + handleNoCondEVCase(GotoJoinData); + return; + } + + // Collect Cond EV users + std::vector BadUsers; + BranchInst *CorrectUser = nullptr; + collectCondEVUsers(CondEV, BadUsers, CorrectUser); + + // Nothing needs to be fixed. However, allow this algorithm to fix + // case with TrueSucc == FalseSucc for goto in order to simplify further + // analysis. + if (BadUsers.empty() && GotoJoinData.isJoin()) + return; + + BasicBlock *TrueSucc = nullptr; + BasicBlock *FalseSucc = nullptr; + + if (!CorrectUser) { + // Branch was optimized by some pass. We need to create it again. + handleOptimizedBranchCase(GotoJoinData, TrueSucc, FalseSucc); + } else { + // Branch is still here. Perform actions needed. + handleExistingBranchCase(GotoJoinData, TrueSucc, FalseSucc, CorrectUser); + } + + // Update users + updateBadCondEVUsers(GotoJoinData, BadUsers, TrueSucc, FalseSucc); +} + +/*********************************************************************** + * splitGotoJoinBlocks : split Basic Blocks that contains goto/join + * + * This is used to solve problems that can be introduced by some + * standard LLVM passes: one of them is simplified CFG that lead to + * goto/join's condition usage by non-branch instruction. After this + * transformation each BB will contain only one goto or join instruction + * (or none of them), that fact allows us to make further changes simplier. + */ +void GenXSimdCFConformance::splitGotoJoinBlocks() { + + LLVM_DEBUG(dbgs() << "Splitting GotoJoin Blocks\n"); + + for (auto &Elem : GotoJoinEVsMap) { + + Value *GotoJoin = Elem.first; + auto &GotoJoinData = Elem.second; + + LLVM_DEBUG(dbgs() << "Trying to split BB for:\n" << *GotoJoin << "\n"); + + handleCondValue(GotoJoin); + + if (GotoJoinData.isJoin()) { + auto SplitPoint = GotoJoinData.getSplitPoint(); + if (SplitPoint->isTerminator()) + continue; + SplitPoint->getParent()->splitBasicBlock(SplitPoint, "split_for_join"); + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(SplitPoint->getParent()->getParent()); + domTree->recalculate(*SplitPoint->getParent()->getParent()); + } + } + + LLVM_DEBUG(dbgs() << "Done splitting\n\n" << *M << "\n\n"); +} + +/*********************************************************************** + * removeFromEMRMVals : remove a value from EMVals or RMVals + * + * This is used just before erasing a phi node in moveCodeInJoinBlocks. + */ +void GenXSimdCFConformance::removeFromEMRMVals(Value *V) +{ + auto VT = dyn_cast(V->getType()); + if (!VT || !VT->getElementType()->isIntegerTy(1)) + return; + if (EMVals.remove(SimpleValue(V, 0))) + return; + for (auto i = RMVals.begin(), e = RMVals.end(); i != e; ++i) { + auto RMValsEntry = &i->second; + if (RMValsEntry->remove(SimpleValue(V, 0))) + return; + } +} + +/*********************************************************************** + * hoistGotoUser : hoist instruction that uses goto's EV and is located + * after it in the same basic block. + * + * Since goto must be at the end of basic block, we have to solve + * this problem somehow. Current approach is to duplicate instruction + * on both paths (true and false) and update uses. + * + * It is always possible to perform such transformation even if there + * is a chain of users: we just can duplicate them all. 
Since we know + * all values on the true pass, it should be possible to perform full + * calculation in this case. However, it is not done now because it can + * lead to much worse code when SIMD CF is not conformant (we are not + * sure that it is conformant at this point). + */ +bool GenXSimdCFConformance::hoistGotoUser(Instruction *Inst, CallInst *Goto, unsigned operandNo) +{ + // Find branch for goto + ExtractValueInst *CondEV = GotoJoinEVsMap[Goto].getCondEV(); + auto BrIt = std::find_if(CondEV->use_begin(), CondEV->use_end(), + [&Goto](const Use& u) { + auto Br = dyn_cast(u.getUser()); + return (Br && Br->getParent() == Goto->getParent() && Br->isConditional()); + }); + assert(BrIt != CondEV->use_end() && "All gotos should become branching earlier!"); + + BranchInst *Br = cast(BrIt->getUser()); + BasicBlock *TrueSucc = Br->getSuccessor(0); + BasicBlock *FalseSucc = Br->getSuccessor(1); + + // Handle FallThrough block with phis. + // + // TODO: it is redundant in some cases. For example, there can be Phi that + // uses bitcasts from EM from two paths. In this case we can use one + // GetEM from Phi with EM. Currently there is no trivial mechanism + // to check for that because in this case Phi arguments are supposed to use + // different Exectution Masks according to DF. + // + // Temporary solution for that is to place a splitter block that branches to + // such bb directly. Examples of that case can be found in local-atomics + // tests in ISPC. + if (isa(&FalseSucc->front())) { + BasicBlock *Splitter = BasicBlock::Create(FalseSucc->getContext(), "phi_fallthrough_splitter", FalseSucc->getParent()); + Splitter->moveAfter(Goto->getParent()); + BranchInst::Create(FalseSucc, Splitter); + Br->setSuccessor(1, Splitter); + // Update phis + for (auto CurrInst = &FalseSucc->front(); + auto PN = dyn_cast(CurrInst); + CurrInst = CurrInst->getNextNode()) { + for (unsigned idx = 0, num = PN->getNumIncomingValues(); idx < num; ++idx) { + if (PN->getIncomingBlock(idx) == Goto->getParent()) + PN->setIncomingBlock(idx, Splitter); + } + } + FalseSucc = Splitter; + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); + } + + // Copy instruction and set the value for true block. Place it before goto. + Instruction *TrueVal = Inst->clone(); + TrueVal->insertBefore(Goto); + TrueVal->setOperand(operandNo, Constant::getNullValue(Inst->getOperand(operandNo)->getType())); + + // Copy instruction and place it in the false successor. Get EM will be + // created later to handle its goto use. + Instruction *FalseVal = Inst->clone(); + FalseVal->insertBefore(FalseSucc->getFirstNonPHI()); + + // Handle all users + BasicBlockEdge TrueEdge(Goto->getParent(), TrueSucc); + BasicBlockEdge FalseEdge(Goto->getParent(), FalseSucc); + std::map foundVals; + std::vector newOperands; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = dyn_cast(ui->getUser()); + // TODO: it can be solved with duplicated instructions. + // Currently we are not going to duplicate them. 
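+    // A user in the goto's own block would itself have to be duplicated along
+    // both paths, so in that case the two clones made above are erased and
+    // hoisting is abandoned.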
+ if (User->getParent() == Inst->getParent()) { + TrueVal->eraseFromParent(); + FalseVal->eraseFromParent(); + return false; + } + + BasicBlock *Loc = User->getParent(); + if (auto PN = dyn_cast(User)) + Loc = PN->getIncomingBlock(ui->getOperandNo()); + + // Store new value + Value *NewOperand = nullptr; + if (Loc == Goto->getParent()) + NewOperand = TrueVal; + else + NewOperand = findGotoJoinVal(RegCategory::EM, Loc, Inst, TrueEdge, FalseEdge, + TrueVal, FalseVal, foundVals); + + newOperands.push_back(NewOperand); + } + + // Update uses + unsigned i = 0; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue;) { + auto User = dyn_cast(ui->getUser()); + unsigned opNo = ui->getOperandNo(); + ++ui; + User->setOperand(opNo, newOperands[i++]); + } + + return true; +} + +/*********************************************************************** + * moveCodeInGotoBlocks : move code in goto blocks + * + * A goto and its extractvalues must be at the end of the block. (Actually, if + * the !any result of the goto is used in a conditional branch at the end of + * the block, then the goto being baled into the branch means that it is + * treated as being at the end of the block anyway. The only reason we need to + * sink it here is to ensure that isGotoBlock works.) + * + * This can silently fail to sink a goto, in which case checkGoto will spot that + * the goto is not conformant. + */ +void GenXSimdCFConformance::moveCodeInGotoBlocks(bool hoistGotoUsers) +{ + for (auto gi = EMVals.begin(), ge = EMVals.end(); gi != ge; ++gi) { + auto EMVal = *gi; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_goto) + continue; + auto Goto = cast(EMVal.getValue()); + // We want to sink the goto and its extracts. In fact we hoist any other + // instruction, checking that it does not use the extracts. + // With hoistGotoUsers, we are trying to hoist them, too. + // We are skipping all instructions that use skipped instructions + // in order to save dominance. + std::set Skipping; + for (Instruction *NextInst = Goto->getNextNode();;) { + auto Inst = NextInst; + if (Inst->isTerminator()) + break; + assert(Inst); + NextInst = Inst->getNextNode(); + if (auto Extract = dyn_cast(Inst)) + if (Extract->getOperand(0) == Goto) + continue; + bool Failed = false; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (auto I = dyn_cast(Inst->getOperand(oi))) + if (Skipping.count(I)) { + LLVM_DEBUG(dbgs() << "Skipping " << Inst->getName() << " due to use of skipped inst\n"); + Skipping.insert(Inst); + Failed = true; + break; + } + if (auto Extract = dyn_cast(Inst->getOperand(oi))) + if (Extract->getOperand(0) == Goto) { + // This is used after splitting basic blocks. + // To perform this all gotos must be branching since EM + // is changed by goto. + if (hoistGotoUsers && hoistGotoUser(Inst, Goto, oi)) { + continue; + } + LLVM_DEBUG(dbgs() << "moveCodeInGotoBlocks: " << Goto->getName() << " failed\n"); + LLVM_DEBUG(dbgs() << "Could not hoist " << Inst->getName() << "\n"); + Failed = true; + Skipping.insert(Inst); + break; // Intervening instruction uses extract of goto; abandon + } + } + if (Failed) + continue; + if (Inst->getNumUses() == 0) { + Inst->eraseFromParent(); + continue; + } + // Hoist the instruction. + Inst->removeFromParent(); + Inst->insertBefore(Goto); + } + } +} + +/*********************************************************************** + * moveCodeInJoinBlocks : move code in join blocks as necessary + * + * 1. 
For a join label block (a block that is the JIP of other gotos/joins), a + * join must come at the start of the block. + * + * 2. For a branching join block (one whose conditional branch condition is the + * !any result from a join), the join must be at the end of the block. + * + * 3. For a block that has one join with both of the above true, we need to move + * all other code out of the block. + * + * We achieve this as follows: + * + * a. First handle case 3. For any such block, hoist any other code to the end + * of its immediate dominator. To allow for the immediate dominator also + * being a case 3 join, we process blocks in post-order depth first search + * order, so we visit a block before its dominator. Thus code from a case 3 + * join block eventually gets moved up to its closest dominating block that + * is not a case 3 join block. + * + * Because it is more convenient and does not hurt, we also hoist the code + * before the first join in a block that initially looks like it is case 3, + * even if it then turns out not to be a case 3 join because it has multiple + * joins. + * + * b. Then scan all joins handling case 1. + * + * c. No need to handle case 2 here, as it (together with a similar requirement + * to sink a goto in a branching goto block) is checked in checkConformance + * and treated as sunk subsequently by virtue of getting baled in to the + * branch. + * + * This happens in both SIMD CF conformance passes, in case constant loading + * etc sneaks code back into the wrong place in a join block. Any pass after + * the late SIMD CF conformance pass needs to be careful not to sneak code back + * into a join block. + * + * Any failure to do the above is not flagged here, but it will be spotted when + * checking the join for conformance. + * + * moveCodeInGotoBlocks needs to run first, as we rely on its sinking of an + * unconditional branch goto for isBranchingGotoJoinBlock to work. + */ +void GenXSimdCFConformance::moveCodeInJoinBlocks() +{ + // a. Handle case 3 join blocks. + if (!FG) { + // Early pass: iterate all funcs in the module. + for (auto mi = M->begin(), me = M->end(); mi != me; ++mi) { + Function *F = &*mi; + if (!F->empty()) + emptyBranchingJoinBlocksInFunc(F); + } + } else { + // Late pass: iterate all funcs in the function group. + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + emptyBranchingJoinBlocksInFunc(F); + } + } + // b. Process all other joins (in fact all joins, but ones successfully + // processed above will not need anything doing). + // Get the joins into a vector first, because the code below modifies EMVals. + SmallVector Joins; + for (auto ji = EMVals.begin(), je = EMVals.end(); ji != je; ++ji) { + auto EMVal = *ji; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_join) + continue; + Joins.push_back(cast(EMVal.getValue())); + } + for (auto ji = Joins.begin(), je = Joins.end(); ji != je; ++ji) { + auto Join = *ji; + auto JoinBlock = Join->getParent(); + if (GotoJoin::isJoinLabel(JoinBlock, /*SkipCriticalEdgeSplitter=*/true)) + hoistJoin(Join); + else { + // The join is in a block that is not a join label. Also check the case + // that there is a predecessor that: + // 1. has one successor; and + // 2. is empty other than phi nodes; and + // 3. is a join label. + // In that case we merge the two blocks, merging phi nodes. + // I have seen this situation arise where LLVM decides to add a loop + // pre-header block. 
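+      // A simplified illustration (block and value names made up):
+      //
+      //   preheader:                         ; empty apart from the branch
+      //     br label %join_bb
+      //   join_bb:
+      //     %v = phi [ %a, %preheader ], [ %b, %latch ]
+      //     %j = call @llvm.genx.simdcf.join(...)
+      //
+      // is merged into
+      //
+      //   join_bb:
+      //     %v = phi [ %a, %entry ], [ %b, %latch ]
+      //     %j = call @llvm.genx.simdcf.join(...)
+      //
+      // where %entry is preheader's own predecessor.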
+ BasicBlock *PredBlock = nullptr; + for (auto ui = JoinBlock->use_begin(), ue = JoinBlock->use_end(); ui != ue; ++ui) { + auto Br = dyn_cast(ui->getUser()); + if (!Br || Br->isConditional()) + continue; + auto BB = Br->getParent(); + if (BB->getFirstNonPHIOrDbg() != Br) + continue; + if (GotoJoin::isJoinLabel(BB, /*SkipCriticalEdgeSplitter=*/true)) { + PredBlock = BB; + break; + } + } + if (PredBlock) { + // We have such a predecessor block. First hoist the join in our block. + if (hoistJoin(Join)) { + // Join hoisting succeeded. Now merge the blocks. + LLVM_DEBUG(dbgs() << "moveCodeInJoinBlocks: merging " << PredBlock->getName() + << " into " << JoinBlock->getName() << "\n"); + // First adjust the phi nodes to include both blocks' incomings. + for (auto Phi = dyn_cast(&JoinBlock->front()); Phi; + Phi = dyn_cast(Phi->getNextNode())) { + int Idx = Phi->getBasicBlockIndex(PredBlock); + if (Idx >= 0) { + Value *Incoming = Phi->getIncomingValue(Idx); + auto PredPhi = dyn_cast(Incoming); + if (PredPhi && PredPhi->getParent() != PredBlock) + PredPhi = nullptr; + if (PredPhi) { + // The incoming in JoinBlock is a phi node in PredBlock. Add its + // incomings. + Phi->removeIncomingValue(Idx, /*DeletePHIIfEmpty=*/false); + for (unsigned oi = 0, oe = PredPhi->getNumIncomingValues(); + oi != oe; ++oi) + Phi->addIncoming(PredPhi->getIncomingValue(oi), + PredPhi->getIncomingBlock(oi)); + } else { + // Otherwise, add the predecessors of PredBlock to the phi node + // in JoinBlock. + for (auto ui2 = PredBlock->use_begin(), + ue2 = PredBlock->use_end(); ui2 != ue2; ++ui2) { + Instruction *Term = dyn_cast(ui2->getUser()); + assert(Term); + if (Term->isTerminator()) { + auto PredPred = Term->getParent(); + if (Idx >= 0) { + Phi->setIncomingBlock(Idx, PredPred); + Idx = -1; + } else + Phi->addIncoming(Incoming, PredPred); + } + } + } + } + } + // Any phi in PredBlock that is not used in a phi in JoinBlock (and + // so still has at least one use after the code above) needs to be + // moved to JoinBlock, with itself added as the extra incomings. The + // incoming blocks to JoinBlock other than PredBlock must be loop + // back edges. + for (;;) { + auto Phi = dyn_cast(&PredBlock->front()); + if (!Phi) + break; + if (Phi->use_empty()) { + removeFromEMRMVals(Phi); + Phi->eraseFromParent(); + continue; + } + for (auto ui = JoinBlock->use_begin(), ue = JoinBlock->use_end(); + ui != ue; ++ui) { + auto Term = dyn_cast(ui->getUser()); + assert(Term); + if (!Term->isTerminator()) + continue; + auto TermBB = Term->getParent(); + if (TermBB == PredBlock) + continue; + Phi->addIncoming(Phi, TermBB); + } + Phi->removeFromParent(); + Phi->insertBefore(&JoinBlock->front()); + } + // Adjust branches targeting PredBlock to target JoinBlock instead. + PredBlock->replaceAllUsesWith(JoinBlock); + // Remove PredBlock. + PredBlock->eraseFromParent(); + } + } + } + } +} + +/*********************************************************************** + * emptyBranchingJoinBlocksInFunc : empty other instructions out of each + * block in a function that is both a join label and a branching join block + * + * See comment for moveCodeInJoinBlocks above. 
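+ *
+ * Blocks are visited in post order so that, where such blocks are nested, an
+ * inner one is emptied before the block that dominates it and hoisted code
+ * can cascade upwards, as described in point (a) of that comment.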
+ */ +void GenXSimdCFConformance::emptyBranchingJoinBlocksInFunc(Function *F) +{ + for (auto i = po_begin(&F->getEntryBlock()), e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + CallInst *Join = GotoJoin::isBranchingJoinBlock(BB); + if (!Join) + continue; + emptyBranchingJoinBlock(Join); + } +} + +/*********************************************************************** + * emptyBranchingJoinBlock : empty instructions other than the join (and its + * extracts) from this branching join block + */ +void GenXSimdCFConformance::emptyBranchingJoinBlock(CallInst *Join) +{ + BasicBlock *BB = Join->getParent(); + Instruction *InsertBefore = nullptr; + for (Instruction *NextInst = BB->getFirstNonPHIOrDbg();;) { + auto Inst = NextInst; + if (Inst->isTerminator()) + break; + NextInst = Inst->getNextNode(); + if (Inst == Join) + continue; // do not hoist the join itself + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == GenXIntrinsic::genx_simdcf_join) + break; // we have encountered another join; there must be more than one + if (auto EV = dyn_cast(Inst)) + if (EV->getOperand(0) == Join) + continue; // do not hoist an extract of the join + if (Inst->getNumUses() == 0) { + Inst->eraseFromParent(); + Modified = true; + continue; + } + // Check that the instruction's operands do not use anything in this block + // (the phi nodes, or the join and extracts being left behind). + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + auto Opnd = dyn_cast(Inst->getOperand(oi)); + if (Opnd && Opnd->getParent() == BB) { + LLVM_DEBUG(dbgs() << "Failed to empty branching join label for join " << Join->getName() << "\n"); + return; // Instruction uses something in this block: abandon. + } + } + if (!InsertBefore) { + // Lazy determination of the insert point. If it is a branching goto/join + // block, insert before the goto/join. + auto DomTree = getDomTree(BB->getParent()); + assert(DomTree); + auto BBNode = DomTree->getNode(BB); + assert(BBNode); + auto InsertBB = BBNode->getIDom()->getBlock(); + InsertBefore = GotoJoin::isBranchingGotoJoinBlock(InsertBB); + if (!InsertBefore) + InsertBefore = InsertBB->getTerminator(); + } + // Hoist the instruction. + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + Modified = true; + } +} + +/*********************************************************************** + * getDomTree : get dominator tree for a function + */ +DominatorTree *GenXSimdCFConformance::getDomTree(Function *F) +{ + if (!DTWrapper) { + // In early pass, which is a module pass. + if (!DTs[F]) { + auto DT = new DominatorTree; + DT->recalculate(*F); + DTs[F] = DT; + } + return DTs[F]; + } + // In late pass, use the DominatorTreeGroupWrapper. + return DTWrapper->getDomTree(F); +} + +/*********************************************************************** + * hoistJoin : hoist a join to the top of its basic block if possible + * + * Return: whether succeeded + * + * This is used for a join in a block that is a join label, but not a branching + * join block. See comment for emptyJoinBlocks above. + * + * There might be multiple joins in the function, and the one supplied is not + * necessarily the first one. If it is a later one, this function will silently + * fail, which is harmless. If it silently fails for the first join, then we + * end up with a join label block that does not start with a join, which + * checkConformance will spot later on. 
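+ *
+ * In the successful case the effect is simply (illustrative):
+ *
+ *   before:  %p = phi ... ; %x = add ... ; %j = call @llvm.genx.simdcf.join(...)
+ *   after:   %p = phi ... ; %j = call @llvm.genx.simdcf.join(...) ; %x = add ...
+ *
+ * provided no operand of the join is defined by a non-phi instruction earlier
+ * in the block (an EV of a goto/join from another block is instead sunk to
+ * just after that goto/join).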
+ * + * This function does return whether it has succeeded, which is used in + * moveCodeInJoinBlocks in the case that it wants to merge a loop pre-header + * back into the join block. + */ +bool GenXSimdCFConformance::hoistJoin(CallInst *Join) +{ + // This only works if no operand of the join uses one of the instructions + // before it in the block, other than phi nodes. + // However, if we find such an instruction and it is an extractvalue from the + // result of an earlier goto/join in a different block, we can just move it + // to after that goto/join. + for (unsigned oi = 0, oe = Join->getNumArgOperands(); oi != oe; ++oi) { + auto Opnd = dyn_cast(Join->getOperand(oi)); + if (!Opnd || isa(Opnd)) + continue; + if (Opnd->getParent() == Join->getParent()) { + if (auto EV = dyn_cast(Opnd)) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(EV->getOperand(0)); + if (IID == GenXIntrinsic::genx_simdcf_goto + || IID == GenXIntrinsic::genx_simdcf_join) { + auto GotoJoin = cast(EV->getOperand(0)); + if (GotoJoin->getParent() != Join->getParent()) { + LLVM_DEBUG(dbgs() << "moving out of join block: " << *EV << "\n"); + EV->removeFromParent(); + EV->insertBefore(GotoJoin->getNextNode()); + continue; + } + } + } + LLVM_DEBUG(dbgs() << "hoistJoin: " << Join->getName() << " failed\n"); + return false; // failed -- join uses non-phi instruction before it + } + } + // Hoist the join. + auto BB = Join->getParent(); + auto InsertBefore = BB->getFirstNonPHIOrDbg(); + if (InsertBefore == Join) + return true; // already at start + Join->removeFromParent(); + Join->insertBefore(InsertBefore); + GotoJoinEVsMap[Join].hoistEVs(); + Modified = true; + return true; +} + +/*********************************************************************** + * ensureConformance : check for conformance, and lower any non-conformant + * gotos and joins + */ +void GenXSimdCFConformance::ensureConformance() +{ + // Push all EM values onto the stack for checking. Push the joins last, since + // we want to process those before their corresponding gotos, so that + // GotoJoinMap is set for a goto by the time we process a valid goto. + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID != GenXIntrinsic::genx_simdcf_join && + IID != GenXIntrinsic::genx_simdcf_unmask && + IID != GenXIntrinsic::genx_simdcf_remask) + EMValsStack.insert(*i); + } + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID == GenXIntrinsic::genx_simdcf_join) + EMValsStack.insert(*i); + } // Process the stack. + SmallVector GotosToLower; + SmallVector JoinsToLower; + for (;;) { + if (!EMValsStack.empty()) { + // Remove and process the top entry on the stack. + auto EMVal = EMValsStack.back(); + EMValsStack.pop_back(); + if (checkEMVal(EMVal)) + continue; + removeBadEMVal(EMVal); + if (!EMVal.getIndex()) { + if (auto CI = dyn_cast(EMVal.getValue())) { + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_goto: + GotosToLower.push_back(CI); + break; + case GenXIntrinsic::genx_simdcf_join: + JoinsToLower.push_back(CI); + break; + default: + break; + } + } + } + continue; + } + // The stack is empty. Check for EM values interfering with each other. + checkEMInterference(); + if (EMValsStack.empty()) { + // Stack still empty; we have finished. 
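+      // (If checkEMInterference removed any interfering value,
+      // removeBadEMVal pushed its neighbours back onto the stack and we
+      // would go round the loop again instead of finishing.)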
+ break; + } + } + // In the late pass, we are not expecting to have found any non-conformant + // gotos and joins that need lowering. All such gotos and joins should have + // been identified in the early pass, unless passes in between have + // transformed the code in an unexpected way that has made the simd CF + // non-conformant. Give an error here if this has happened. + if (isLatePass() && (!GotosToLower.empty() || !JoinsToLower.empty())) + llvm_unreachable("unexpected non-conformant SIMD CF in late SIMD CF conformance pass"); + // Lower gotos and joins that turned out to be non-conformant. + for (auto i = GotosToLower.begin(), e = GotosToLower.end(); i != e; ++i) + lowerGoto(*i); + for (auto i = JoinsToLower.begin(), e = JoinsToLower.end(); i != e; ++i) + lowerJoin(*i); +} + +/*********************************************************************** + * getEMProducer : perform recurrent check for EM terms. + * + * It goes through all phis and bitcasts (when BitCastAllowed is true) + * and determines whether the EM is correct in DF terms. It doesn't + * check live range interference, but can spot non-conformant usage + * in case when EM from bad instruction is being used. + * + * This approach is used when we need to perform some actions on full + * EM data flow, for example, to insert phis when eliminating redundant + * bitcasts. + * + * All found EM producers are stored in EMProducers and can be used + * later without performing full search. + * + * TODO: currently returns User if it deals with EM. It is done in + * this way as workaround for possible future changes (for example, + * getConnectedVals refactor). The idea of such approach is to be + * able to update info if something changes. + */ +Value *GenXSimdCFConformance::getEMProducer(Value *User, std::set &Visited, bool BitCastAllowed) +{ + LLVM_DEBUG(dbgs() << "Looking for EM producer for value:\n" << *User << "\n"); + + if (Visited.count(User)) { + if (dyn_cast(User)) + return User; + return nullptr; + } + + // Check for previously found value + auto It = EMProducers.find(User); + if (It != EMProducers.end()) { + LLVM_DEBUG(dbgs() << "Using previously found value:\n" << *It->second << "\n"); + return It->second; + } + + if (auto C = dyn_cast(User)) { + // All one is considered as EM at entry point + if (C->isAllOnesValue()) { + LLVM_DEBUG(dbgs() << "EMProducer is an AllOne constant\n"); + EMProducers[C] = C; + return C; + } + } else if (auto PN = dyn_cast(User)) { + // For phi node, check all its preds. They all must be EMs + Visited.insert(PN); + for (unsigned idx = 0, opNo = PN->getNumOperands(); idx < opNo; ++idx) { + Value *Pred = PN->getOperand(idx); + + if (!getEMProducer(Pred, Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "!!! Bad phi pred detected for:\n" << *PN << "\n"); + EMProducers[PN] = nullptr; + return nullptr; + } + } + + LLVM_DEBUG(dbgs() << "EMProducer is phi itself:\n" << *PN << "\n"); + EMProducers[PN] = PN; + return PN; + } else if (auto EVI = dyn_cast(User)) { + // Extract value can be an EV from goto/join or from callee that + // returned it. For the second case we check that the pred is + // still in EM values since it could be lowered. + CallInst *CI = dyn_cast(EVI->getOperand(0)); + if (CI) { + // Goto/join check + if (GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_join) { + LLVM_DEBUG(dbgs() << "Reached goto/join\n"); + EMProducers[EVI] = EVI; + return EVI; + } + + // EV from other calls. 
+ if (EMVals.count(SimpleValue(CI, EVI->getIndices()[0]))) { + LLVM_DEBUG(dbgs() << "Value from return\n"); + EMProducers[EVI] = EVI; + return EVI; + } + } + } else if (auto Arg = dyn_cast(User)){ + // For argument we need to ensure that it is still in EM values + // since it could be lowered. + if (EMVals.count(SimpleValue(Arg, Arg->getArgNo()))) { + LLVM_DEBUG(dbgs() << "Input argument\n"); + EMProducers[Arg] = Arg; + return Arg; + } + } else if (auto IVI = dyn_cast(User)) { + // Insert value prepares structure for return. Check the + // value that is being inserted + Visited.insert(IVI); + if (auto EMProd = getEMProducer(IVI->getInsertedValueOperand(), Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "Insert for return\n"); + EMProducers[IVI] = EMProd; + return IVI; + } + } else if (BitCastAllowed) { + if (auto BCI = dyn_cast(User)) { + // BitCast doesn't produce new EM. Just go through it. + Visited.insert(BCI); + if (auto EMProd = getEMProducer(BCI->getOperand(0), Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "Bitcast from EM producer\n"); + EMProducers[BCI] = EMProd; + return BCI; + } + } + } + + // All other instructions cannot be treated as EM producers + LLVM_DEBUG(dbgs() << "!!! IT IS NOT A EM PRODUCER !!!\n"); + return nullptr; +} + +/*********************************************************************** + * lowerUnsuitableGetEMs : remove all unsuitable get_em intrinsics. + * + * This intrinsic is unsuitable if: + * - It uses constant value: it is simply redundant + * - The EM argument is not actually a EM: this may happen if + * SIMD CF was non-conformant and this EM was lowered. + */ +void GenXSimdCFConformance::lowerUnsuitableGetEMs() +{ + Type *I1Ty = Type::getInt1Ty(M->getContext()); + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, + { VectorType::get(I1Ty, 32) }); + for (auto ui = GetEMDecl->use_begin(); ui != GetEMDecl->use_end();) { + std::set Visited; + auto GetEM = dyn_cast(ui->getUser()); + ++ui; + auto GetEMPred = GetEM->getOperand(0); + + // Constants and non-EM values should be used directly + if (dyn_cast(GetEMPred) || !getEMProducer(dyn_cast(GetEMPred), Visited)) { + GetEM->replaceAllUsesWith(GetEM->getOperand(0)); + GetEM->eraseFromParent(); + } + } +} + +/*********************************************************************** + * lowerAllSimdCF : do NOT check for conformance, and simply lower + * all any gotos, joins, and unmasks + */ +void GenXSimdCFConformance::lowerAllSimdCF() +{ + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + if (auto CI = dyn_cast(i->getValue())) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID == GenXIntrinsic::genx_simdcf_join) + lowerJoin(CI); + else if (IID == GenXIntrinsic::genx_simdcf_goto) + lowerGoto(CI); + else if (IID == GenXIntrinsic::genx_simdcf_unmask) { + auto SaveMask = CI->getArgOperand(0); + if (auto CI0 = dyn_cast(SaveMask)) { + IRBuilder<> Builder(CI0); + auto Replace = Builder.CreateBitCast(CI0->getArgOperand(0), CI0->getType()); + CI0->replaceAllUsesWith(Replace); + CI0->eraseFromParent(); + } + IRBuilder<> Builder(CI); + auto Replace = Builder.CreateBitCast(CI->getArgOperand(1), CI->getType()); + CI->replaceAllUsesWith(Replace); + CI->eraseFromParent(); + } + else if (IID == GenXIntrinsic::genx_simdcf_remask) { + IRBuilder<> Builder(CI); + auto Replace = Builder.CreateBitCast(CI->getArgOperand(1), CI->getType()); + CI->replaceAllUsesWith(Replace); + CI->eraseFromParent(); + } + } + } +} + 
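+/***********************************************************************
+ * Note on lowerAllSimdCF above: gotos and joins go through
+ * lowerGoto/lowerJoin below, while unmask/remask (and any feeding
+ * savemask) collapse to plain bitcasts of their mask operand, e.g.
+ * (illustrative, operand types elided):
+ *
+ *   %s = call @llvm.genx.simdcf.savemask(%m)     -> bitcast %m
+ *   %e = call @llvm.genx.simdcf.unmask(%s, %v)   -> bitcast %v
+ *   %r = call @llvm.genx.simdcf.remask(%e, %v2)  -> bitcast %v2
+ */
+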
+/*********************************************************************** + * checkEMVal : check an EM value for conformance + * + * Return: true if ok, false if the EM value needs to be removed + */ +bool GenXSimdCFConformance::checkEMVal(SimpleValue EMVal) +{ + LLVM_DEBUG(dbgs() << "checkEMVal " << *EMVal.getValue() << "#" << EMVal.getIndex() << "\n"); + if (!EnableGenXGotoJoin) + return false; // use of goto/join disabled + SmallVector ConnectedVals; + // Check connected values. Do not lower bad users in Late Pass because + // current SIMD CF Conformance check approach expects that SIMD CF must + // be OK at this point if it wasn't lowered during Early Pass. + if (!getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals, /*LowerBadUsers=*/!FG)) { + LLVM_DEBUG(dbgs() << "invalid def or uses\n"); + return false; // something invalid about the EM value itself + } + // Check that all connected values are EM values. + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) { + SimpleValue ConnectedVal = *i; + if (auto C = dyn_cast(ConnectedVal.getValue())) { + if (!C->isAllOnesValue()) { + LLVM_DEBUG(dbgs() << "ConnectedVal is constant that is not all ones\n"); + return false; // uses constant that is not all ones, invalid + } + } else if (!EMVals.count(ConnectedVal)) { + LLVM_DEBUG(dbgs() << "ConnectedVal is not in EMVals\n"); + return false; // connected value is not in EMVals + } + } + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_goto: + return checkGoto(EMVal); + case GenXIntrinsic::genx_simdcf_join: + return checkJoin(EMVal); + default: + break; + } + return true; +} + +/*********************************************************************** + * checkGotoJoinSunk : check whether a goto/join is sunk to the bottom of + * its basic block, other than extractvalues from its result + */ +static bool checkGotoJoinSunk(CallInst *GotoJoin) +{ + for (Instruction *Inst = GotoJoin;;) { + Inst = Inst->getNextNode(); + if (Inst->isTerminator()) { + if (!isa(Inst)) + return false; + break; + } + auto EV = dyn_cast(Inst); + if (!EV || EV->getOperand(0) != GotoJoin) + return false; + } + return true; +} + +/*********************************************************************** + * checkGoto : check conformance of an actual goto instruction + */ +bool GenXSimdCFConformance::checkGoto(SimpleValue EMVal) +{ + if (!checkGotoJoin(EMVal)) + return false; + // Check that there is a linked join. (We do not need to check here that the + // linked join is an EM value; that happened in checkEMVal due to the join + // being treated as a linked value in getConnectedVals.) + auto Goto = cast(EMVal.getValue()); + if (!GotoJoinMap[Goto]) { + LLVM_DEBUG(dbgs() << "checkGoto: no linked join\n"); + return false; + } + // Check that the goto is sunk to the end of the block, other than extracts + // from its result, and a branch. moveCodeInGotoBlocks ensures that if + // possible; if that failed, this conformance check fails. + if (!checkGotoJoinSunk(Goto)) { + LLVM_DEBUG(dbgs() << "checkGoto: not sunk\n"); + return false; + } + return true; +} + +/*********************************************************************** + * checkJoin : check conformance of an actual join instruction + */ +bool GenXSimdCFConformance::checkJoin(SimpleValue EMVal) +{ + if (!checkGotoJoin(EMVal)) + return false; + // Check that the join is at the start of the block. 
emptyJoinBlock should + // have ensured this, unless the code was such that it could not. + auto Join = cast(EMVal.getValue()); + if (!GotoJoin::isValidJoin(Join)) { + LLVM_DEBUG(dbgs() << "not valid join\n"); + return false; + } + // If the !any result of this join is used in a conditional branch at the + // end, check that the join is sunk to the end of the block, other than + // extracts from its result, and a branch. moveCodeInJoinBlocks ensures that + // if possible; if that failed, this conformance check fails. + if (auto Br = dyn_cast(Join->getParent()->getTerminator())) + if (Br->isConditional()) + if (auto EV = dyn_cast(Br->getCondition())) + if (EV->getOperand(0) == Join) + if (!checkGotoJoinSunk(Join)) { + LLVM_DEBUG(dbgs() << "checkJoin: not sunk\n"); + return false; + } + // Gather the web of RM values. + auto RMValsEntry = &RMVals[Join]; + RMValsEntry->clear(); + LLVM_DEBUG(dbgs() << "gather web of RM vals for " << *Join << "\n"); + if (!isa(Join->getOperand(1))) + RMValsEntry->insert(Join->getOperand(1)); + for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + // RM is a value in this join's RM web. Get other values related by phi + // nodes and extractvalues and gotos. + SmallVector ConnectedVals; + bool Ok = getConnectedVals(RM, RegCategory::RM, /*IncludeOptional=*/false, + Join, &ConnectedVals); + LLVM_DEBUG( + dbgs() << "getConnectedVals: " << RM.getValue()->getName() << "#" << RM.getIndex() << "\n"; + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) + dbgs() << " " << i->getValue()->getName() << "#" << i->getIndex() << "\n" + ); + if (!Ok) { + LLVM_DEBUG(dbgs() << "illegal RM value in web\n"); + return false; + } + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) { + SimpleValue ConnectedVal = *j; + if (auto C = dyn_cast(ConnectedVal.getValue())) { + // A constant in the RM web must be all zeros. + if (!C->isNullValue()) { + LLVM_DEBUG(dbgs() << "non-0 constant in RM web\n"); + return false; + } + } else { + // Insert the non-constant value. If it is a goto with struct index + // other than 1, it is illegal. + if (RMValsEntry->insert(ConnectedVal)) { + LLVM_DEBUG(dbgs() << "New one: " << ConnectedVal.getValue()->getName() << "#" << ConnectedVal.getIndex() << "\n"); + switch (GenXIntrinsic::getGenXIntrinsicID(ConnectedVal.getValue())) { + case GenXIntrinsic::genx_simdcf_join: + LLVM_DEBUG(dbgs() << "multiple joins in RM web\n"); + return false; + case GenXIntrinsic::genx_simdcf_goto: + if (ConnectedVal.getIndex() != 1/* struct index of RM result */) { + LLVM_DEBUG(dbgs() << "wrong struct index in goto\n"); + return false; + } + break; + default: + break; + } + } + } + } + } + // Check whether the RM values interfere with each other. + SetVector BadDefs; + checkInterference(RMValsEntry, &BadDefs, Join); + if (!BadDefs.empty()) { + LLVM_DEBUG(dbgs() << "RMs interfere\n"); + return false; + } + // Set GotoJoinMap for each goto in the RM web. 
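+  // (GotoJoinMap is what checkGoto consults above to verify that each
+  // conformant goto has a linked join, and getConnectedVals uses it to
+  // connect a goto's EM result to that join.)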
+ for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + if (GenXIntrinsic::getGenXIntrinsicID(RM.getValue()) == GenXIntrinsic::genx_simdcf_goto) + GotoJoinMap[cast(RM.getValue())] = Join; + } + return true; +} + +/*********************************************************************** + * getEmptyCriticalEdgeSplitterSuccessor : if BB is an empty critical edge + * splitter block (one predecessor and one successor), then return the + * single successor + */ +static BasicBlock *getEmptyCriticalEdgeSplitterSuccessor(BasicBlock *BB) +{ + if (!BB->hasOneUse()) + return nullptr; // not exactly one predecessor + auto Term = dyn_cast(BB->getFirstNonPHIOrDbg()); + if (!Term->isTerminator()) + return nullptr; // not empty + auto TI = cast(Term); + if (TI->getNumSuccessors() != 1) + return nullptr; // not exactly one successor + return TI->getSuccessor(0); +} + +/*********************************************************************** + * checkGotoJoin : common code to check conformance of an actual goto or join + * instruction + */ +bool GenXSimdCFConformance::checkGotoJoin(SimpleValue EMVal) +{ + auto CI = cast(EMVal.getValue()); + // If there is an extract of the scalar result of the goto/join, check that + // it is used in the conditional branch at the end of the block. + ExtractValueInst *ExtractScalar = nullptr; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (auto EV = dyn_cast(ui->getUser())) + if (!isa(EV->getType())) { + if (ExtractScalar) { + LLVM_DEBUG(dbgs() << "goto/join has more than one extract of its !any result\n"); + return false; + } + ExtractScalar = EV; + } + if (ExtractScalar) { + if (!ExtractScalar->hasOneUse()) { + LLVM_DEBUG(dbgs() << "goto/join's !any result does not have exactly one use\n"); + return false; + } + auto Br = dyn_cast(ExtractScalar->use_begin()->getUser()); + if (!Br || Br->getParent() != CI->getParent()) { + LLVM_DEBUG(dbgs() << "goto/join's !any result not used in conditional branch in same block\n"); + return false; + } + // For a goto/join with a conditional branch, check that the "true" + // successor is a join label. We also tolerate there being an empty + // critical edge splitter block in between; this will get removed in + // setCategories in this pass. + BasicBlock *TrueSucc = Br->getSuccessor(0); + Instruction *First = TrueSucc->getFirstNonPHIOrDbg(); + if (GenXIntrinsic::getGenXIntrinsicID(First) != GenXIntrinsic::genx_simdcf_join) { + // "True" successor is not a join label. Check for an empty critical edge + // splitter block in between. + TrueSucc = getEmptyCriticalEdgeSplitterSuccessor(TrueSucc); + if (!TrueSucc) { + LLVM_DEBUG(dbgs() << "goto/join true successor not join label\n"); + return false; // Not empty critical edge splitter + } + if (GenXIntrinsic::getGenXIntrinsicID(TrueSucc->getFirstNonPHIOrDbg()) + != GenXIntrinsic::genx_simdcf_join) { + LLVM_DEBUG(dbgs() << "goto/join true successor not join label\n"); + return false; // Successor is not join label + } + } + } + return true; +} + +/*********************************************************************** + * removeBadEMVal : remove a bad EM value + * + * This removes a non-conformant EM value, and pushes any connected EM value + * onto the stack so it gets re-checked for conformance. + */ +void GenXSimdCFConformance::removeBadEMVal(SimpleValue EMVal) +{ + LLVM_DEBUG( + dbgs() << "removeBadEMVal "; + EMVal.print(dbgs()); + dbgs() << "\n" + ); + // Remove the EM value. 
+ if (!EMVals.remove(EMVal)) + return; // was not in EMVals + // Push anything related to it onto the stack for re-checking. + SmallVector ConnectedVals; + getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals); + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) { + SimpleValue ConnectedVal = *i; + if (EMVals.count(ConnectedVal)) + EMValsStack.insert(ConnectedVal); + } +} + +/*********************************************************************** + * pushValues : push EM struct elements in a value onto EMValsStack + */ +void GenXSimdCFConformance::pushValues(Value *V) +{ + for (unsigned si = 0, se = IndexFlattener::getNumElements(V->getType()); + si != se; ++si) { + SimpleValue SV(V, si); + if (EMVals.count(SV)) + EMValsStack.insert(SV); + } +} + +/*********************************************************************** + * checkAllUsesAreSelectOrWrRegion : check that all uses of a value are the + * condition in select or wrregion or wrpredpredregion (or a predicate + * in a non-ALU intrinsic) + * + * This is used in getConnectedVals below for the result of a use of an EM + * value in an rdpredregion, or a shufflevector that is a slice so will be + * lowered to rdpredregion. + */ +static bool checkAllUsesAreSelectOrWrRegion(Value *V) +{ + for (auto ui2 = V->use_begin(); ui2 != V->use_end(); /*empty*/) { + auto User2 = cast(ui2->getUser()); + unsigned OpNum = ui2->getOperandNo(); + ++ui2; + + if (isa(User2)) + continue; + + // Matches uses that can be turned into select. + if (auto BI = dyn_cast(User2)) { + auto Opc = BI->getOpcode(); + Constant *AllOne = Constant::getAllOnesValue(V->getType()); + Constant *AllNul = Constant::getNullValue(V->getType()); + + // EM && X -> sel EM X 0 + // EM || X -> sel EM 1 X + if (Opc == BinaryOperator::And || + Opc == BinaryOperator::Or) { + Value *Ops[3] = {V, nullptr, nullptr}; + if (Opc == BinaryOperator::And) { + Ops[1] = BI->getOperand(1 - OpNum); + Ops[2] = AllNul; + } else if (Opc == BinaryOperator::Or) { + Ops[1] = AllOne; + Ops[2] = BI->getOperand(1 - OpNum); + } + auto SI = SelectInst::Create(Ops[0], Ops[1], Ops[2], ".revsel", BI, BI); + BI->replaceAllUsesWith(SI); + BI->eraseFromParent(); + continue; + } + + // ~EM || X ==> sel EM, X, 1 + using namespace PatternMatch; + if (BI->hasOneUse() && + BI->user_back()->getOpcode() == BinaryOperator::Or && + match(BI, m_Xor(m_Specific(V), m_Specific(AllOne)))) { + Instruction *OrInst = BI->user_back(); + Value *Op = OrInst->getOperand(0) != BI ? OrInst->getOperand(0) + : OrInst->getOperand(1); + auto SI = SelectInst::Create(V, Op, AllOne, ".revsel", OrInst, OrInst); + OrInst->replaceAllUsesWith(SI); + OrInst->eraseFromParent(); + BI->eraseFromParent(); + continue; + } + + // ~EM && X ==> sel EM, 0, X + using namespace PatternMatch; + if (BI->hasOneUse() && + BI->user_back()->getOpcode() == BinaryOperator::And && + match(BI, m_Xor(m_Specific(V), m_Specific(AllOne)))) { + Instruction *AndInst = BI->user_back(); + Value *Op = AndInst->getOperand(0) != BI ? AndInst->getOperand(0) + : AndInst->getOperand(1); + auto SI = SelectInst::Create(V, AllNul, Op, ".revsel", AndInst, AndInst); + AndInst->replaceAllUsesWith(SI); + AndInst->eraseFromParent(); + BI->eraseFromParent(); + continue; + } + } else if (auto CI = dyn_cast(User2)) { + // Turn zext/sext to select. 
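+      // e.g. (illustrative):  zext <N x i1> %EM to <N x i32>
+      //   ==>  select %EM, <N x i32> splat(1), <N x i32> zeroinitializer
+      // and sext likewise, with splat(-1) as the true value.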
+ if (CI->getOpcode() == Instruction::CastOps::ZExt || + CI->getOpcode() == Instruction::CastOps::SExt) { + unsigned NElts = V->getType()->getVectorNumElements(); + unsigned NBits = CI->getType()->getScalarSizeInBits(); + int Val = (CI->getOpcode() == Instruction::CastOps::ZExt) ? 1 : -1; + APInt One(NBits, Val); + Constant *LHS = ConstantVector::getSplat( + NElts, ConstantInt::get(CI->getType()->getScalarType(), One)); + Constant *AllNul = Constant::getNullValue(CI->getType()); + auto SI = SelectInst::Create(V, LHS, AllNul, ".revsel", CI, CI); + CI->replaceAllUsesWith(SI); + CI->eraseFromParent(); + continue; + } + } + + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(User2); + if (GenXIntrinsic::isWrRegion(IID)) + continue; + if (IID == GenXIntrinsic::genx_wrpredpredregion + && OpNum == cast(User2)->getNumArgOperands() - 1) + continue; + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID) + && !cast(User2)->doesNotAccessMemory()) + continue; + return false; + } + return true; +} + +/*********************************************************************** + * getConnectedVals : for a SimpleValue, get other SimpleValues connected to + * it through phi nodes, insertvalue, extractvalue, goto/join, and maybe + * args and return values + * + * Enter: Val = SimpleValue to start at + * Cat = RegCategory::EM to do EM connections + * RegCategory::RM to do RM connections + * IncludeOptional = for EM connections, include optional connections + * where Val is a function arg and it is connected to call args, + * and where Val is the operand to return and it is connected to + * the returned value at call sites + * OkJoin = for RM connections, error if a use in a join other than + * this one is found + * ConnectedVals = vector to store connected values in + * + * Return: true if ok, false if def or some use is not suitable for EM/RM + * + * The provided value must be non-constant, but the returned connected values + * may include constants. Duplicates may be stored in ConnectedVals. + * + * This function is used in three different ways by its callers: + * + * 1. to gather a web of putative EM values or RM values starting at goto/join + * instructions; + * + * 2. to test whether a putative EM/RM value is valid by whether its connected + * neighbors are EM/RM values; + * + * 3. when removing a value from the EM/RM values list, to find its connected + * neighbors to re-run step 2 on each of them. + * + * TODO: some refactoring should be performed here due to quite big + * CF with many different actions. Also some of these actions + * are repeated in different situations. + */ +bool GenXSimdCFConformance::getConnectedVals(SimpleValue Val, int Cat, + bool IncludeOptional, CallInst *OkJoin, + SmallVectorImpl *ConnectedVals, bool LowerBadUsers) +{ + // Check the def first. + if (auto Arg = dyn_cast(Val.getValue())) { + if (Cat != RegCategory::EM) + return false; // can't have RM argument + // Connected to some return value. There is a problem here in that it might + // find another predicate return value that is nothing to do with SIMD CF, + // and thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry about + // that. + // It is possible that EM was optimized from ret. In this case the ret type + // is void. Allow such situation. 
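+    // Illustration (names invented): for
+    //   define <32 x i1> @sub(<32 x i1> %em, ...)
+    // the EM argument %em is connected to the value operand of each
+    // 'ret' in @sub and, with IncludeOptional, to the corresponding
+    // call argument at every call site of @sub.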
+ Function *F = Arg->getParent(); + unsigned RetIdx = 0; + auto RetTy = F->getReturnType(); + auto ValTy = IndexFlattener::getElementType( + Val.getValue()->getType(), Val.getIndex()); + if (auto ST = dyn_cast(RetTy)) { + for (unsigned End = IndexFlattener::getNumElements(ST); ; ++RetIdx) { + if (RetIdx == End) + return false; // no predicate ret value found + if (IndexFlattener::getElementType(ST, RetIdx) == ValTy) + break; + } + } else if (RetTy != ValTy && !RetTy->isVoidTy()) + return false; // no predicate ret value found + if (!RetTy->isVoidTy()) + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) + if (auto Ret = dyn_cast(fi->getTerminator())) + ConnectedVals->push_back(SimpleValue(Ret->getOperand(0), RetIdx)); + if (IncludeOptional) { + // With IncludeOptional, also add the corresponding arg at each call + // site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + ConnectedVals->push_back( + SimpleValue(CI->getArgOperand(Arg->getArgNo()), Val.getIndex())); + } + } else if (auto Phi = dyn_cast(Val.getValue())) { + // phi: add (the corresponding struct element of) each incoming + for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) + ConnectedVals->push_back( + SimpleValue(Phi->getIncomingValue(oi), Val.getIndex())); + } else if (auto EVI = dyn_cast(Val.getValue())) { + // extractvalue: add the appropriate struct element of the input + ConnectedVals->push_back(SimpleValue(EVI->getOperand(0), + Val.getIndex() + IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), + EVI->getIndices()))); + } else if (auto IVI = dyn_cast(Val.getValue())) { + // insertvalue: add the appropriate struct element in either the + // aggregate input or the value to insert input + unsigned InsertedIndex = Val.getIndex() - IndexFlattener::flatten( + cast(IVI->getType()), IVI->getIndices()); + unsigned NumElements = IndexFlattener::getNumElements( + IVI->getOperand(1)->getType()); + SimpleValue SV; + if (InsertedIndex < NumElements) + SV = SimpleValue(IVI->getOperand(1), InsertedIndex); + else + SV = SimpleValue(IVI->getOperand(0), Val.getIndex()); + ConnectedVals->push_back(SV); + } else if (auto SVI = dyn_cast(Val.getValue())) { + // shufflevector: add the EM use + ConnectedVals->push_back(SimpleValue(SVI->getOperand(0), 0)); + } else if (auto CI = dyn_cast(Val.getValue())) { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::genx_simdcf_goto: + // goto: invalid unless it is the EM/RM result of goto as applicable + if (Val.getIndex() != (Cat == RegCategory::EM ? 0U : 1U)) + return false; + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(Val.getIndex())); + // If doing EM connections, add the corresponding join. This does + // nothing if checkJoin has not yet run for the corresponding join, + // since GotoJoinMap has not yet been set up for our goto. We tolerate + // that situation; if the goto really has no linked join, that is + // picked up later in checkGoto. + if (Cat == RegCategory::EM) + if (auto Join = GotoJoinMap[cast(Val.getValue())]) + ConnectedVals->push_back( + SimpleValue(Join, 0/* struct idx of EM result */)); + break; + case GenXIntrinsic::genx_simdcf_join: { + // join: invalid unless it is the EM result + if (Val.getIndex() || Cat != RegCategory::EM) + return false; + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(Val.getIndex())); + // Add the corresponding gotos. 
This does nothing if checkJoin has not + // yet run for this join, since RMVals has not yet been set up for it. + // That is OK, because adding the corresponding gotos here is required + // only when we are called by removeBadEMVal to remove the join, so the + // gotos get re-checked and found to be invalid. + auto RMValsEntry = &RMVals[cast(Val.getValue())]; + for (auto i = RMValsEntry->begin(), e = RMValsEntry->end(); i != e; ++i) + if (GenXIntrinsic::getGenXIntrinsicID(i->getValue()) == GenXIntrinsic::genx_simdcf_goto) + ConnectedVals->push_back( + SimpleValue(i->getValue(), 0/* struct idx of EM result */)); + break; + } + case GenXIntrinsic::genx_simdcf_savemask: + case GenXIntrinsic::genx_simdcf_remask: + case GenXIntrinsic::genx_simdcf_get_em: + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(0)); + return true; + case GenXIntrinsic::genx_constantpred: + // constantpred: add the constant. Don't add any other uses of it, + // because it might be commoned up with other RM webs. + ConnectedVals->push_back(CI->getOperand(0)); + return true; + case GenXIntrinsic::not_any_intrinsic: { + // Value returned from a call. + if (Cat != RegCategory::EM) + return false; // invalid for RM + // Add the corresponding value at each return in the called function. + auto CalledFunc = CI->getCalledFunction(); + for (auto fi = CalledFunc->begin(), fe = CalledFunc->end(); + fi != fe; ++fi) + if (auto Ret = dyn_cast(fi->getTerminator())) + if (!Ret->getType()->isVoidTy()) + ConnectedVals->push_back( + SimpleValue(Ret->getOperand(0), Val.getIndex())); + // Connected to some call arg. There is a problem here in that it might + // find another predicate arg that is nothing to do with SIMD CF, and + // thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry + // about that. + auto ValTy = IndexFlattener::getElementType( + Val.getType(), Val.getIndex()); + for (unsigned Idx = 0, End = CI->getNumArgOperands(); ; ++Idx) { + if (Idx == End) + return false; // no corresponding call arg found + if (CI->getArgOperand(Idx)->getType() == ValTy) { + ConnectedVals->push_back(SimpleValue(CI->getArgOperand(Idx), 0)); + break; + } + } + break; + } + default: + return false; // unexpected call as def + } + } else + return false; // unexpected instruction as def + // Check the uses. + std::vector UsersToLower; + for (auto ui = Val.getValue()->use_begin(), + ue = Val.getValue()->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Phi = dyn_cast(User)) { + // Use in phi node. Add the phi result. + ConnectedVals->push_back(SimpleValue(Phi, Val.getIndex())); + continue; + } + if (auto EVI = dyn_cast(User)) { + // Use in extractvalue. + // If extracting the right index, add the result. + unsigned StartIndex = IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), EVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements(EVI->getType()); + unsigned ExtractedIndex = Val.getIndex() - StartIndex; + if (ExtractedIndex < NumIndices) + ConnectedVals->push_back(SimpleValue(EVI, ExtractedIndex)); + continue; + } + if (auto IVI = dyn_cast(User)) { + // Use in insertvalue. Could be either the aggregate input or the value + // to insert. 
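+    // Illustration: for
+    //   %s = insertvalue {<32 x i1>, <32 x i1>} %agg, <32 x i1> %v, 1
+    // a use as %v maps to flattened element 1 of %s, while a use as
+    // %agg only propagates the elements that the insertvalue does not
+    // overwrite.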
+ unsigned StartIndex = IndexFlattener::flatten( + cast(IVI->getType()), IVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements( + IVI->getOperand(1)->getType()); + if (!ui->getOperandNo()) { + // Use in insertvalue as the aggregate input. Add the corresponding + // element in the result, as long as it is not overwritten by the + // insertvalue. + if (Val.getIndex() - StartIndex >= NumIndices) + ConnectedVals->push_back(SimpleValue(IVI, Val.getIndex())); + } else { + // Use in insertvalue as the value to insert. Add the corresponding + // element in the result. + ConnectedVals->push_back(SimpleValue(IVI, StartIndex + Val.getIndex())); + } + continue; + } + if (isa(User)) { + // Use in a return. + if (Cat != RegCategory::EM) + return false; // invalid for RM + // Connected to some function arg. There is a problem here in that it might + // find another predicate arg that is nothing to do with SIMD CF, and + // thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry + // about that. + auto ValTy = IndexFlattener::getElementType( + Val.getType(), Val.getIndex()); + auto F = User->getParent()->getParent(); + bool Lower = false; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ; ++ai) { + if (ai == ae) { + // no arg of the right type found + Lower = true; + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + } + auto Arg = &*ai; + if (Arg->getType() == ValTy) { + ConnectedVals->push_back(SimpleValue(Arg, 0)); + break; + } + } + if (IncludeOptional && !Lower) { + // With IncludeOptional, also add the values connected by being the + // return value at each call site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + ConnectedVals->push_back(SimpleValue(CI, Val.getIndex())); + } + continue; + } + if (isa(User)) { + // A use in a select is allowed only for EM used as the condition. + if (Cat != RegCategory::EM || ui->getOperandNo() != 0) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + if (auto SVI = dyn_cast(User)) { + if (!ShuffleVectorAnalyzer(SVI).isReplicatedSlice()) { + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + // This is a shufflevector that is a replicated slice, so it can be + // lowered to rdpredregion or baled with instruction with channels. + // (We only see this in the early pass; GenXLowering has + // turned it into rdpredregion by the late pass.) Check that all its uses + // are select or wrregion. + if (!checkAllUsesAreSelectOrWrRegion(SVI)) { + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + // Shufflevector produces EM for value baled inst, so this is a (almost) real EM def: + // add it here to perform correct EM interference check + ConnectedVals->push_back(SimpleValue(SVI, ui->getOperandNo())); + continue; + } + if (auto CI = dyn_cast(User)) { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::genx_simdcf_get_em: + assert(Cat == RegCategory::EM); + // Skip it if the category is right. This + // intrinsic doesn't produce EM + break; + case GenXIntrinsic::genx_simdcf_unmask: + case GenXIntrinsic::genx_simdcf_remask: + assert(Cat == RegCategory::EM); + ConnectedVals->push_back(SimpleValue(CI, 0)); + break; + case GenXIntrinsic::genx_simdcf_goto: + // use in goto: valid only if arg 0 (EM) or 1 (RM) + if (ui->getOperandNo() != (Cat == RegCategory::EM ? 
0U : 1U)) + return false; + // Add corresponding result. + ConnectedVals->push_back(SimpleValue(CI, ui->getOperandNo())); + break; + case GenXIntrinsic::genx_simdcf_join: + // use in join: valid only if arg 0 (EM) or 1 (RM) + if (ui->getOperandNo() != (Cat == RegCategory::EM ? 0U : 1U)) + return false; + // If EM, add corresponding result. + if (Cat == RegCategory::EM) + ConnectedVals->push_back(SimpleValue(CI, 0)); + else if (OkJoin && OkJoin != CI) { + // RM value used in a join other than OkJoin. That is illegal, as we + // can only have one join per RM web. + LLVM_DEBUG(dbgs() << "getConnectedVals: found illegal join: " << CI->getName() << "\n"); + return false; + } + break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + break; // Use as wrregion predicate is allowed. + case GenXIntrinsic::genx_rdpredregion: + // We only see rdpredregion in the late pass; in the early pass it is + // still a shufflevector. Check that all its uses are select or + // wrregion. + if (!checkAllUsesAreSelectOrWrRegion(CI)) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + case GenXIntrinsic::genx_wrpredpredregion: + // Use in wrpredpredregion allowed as the last arg. + if (ui->getOperandNo() + 1 != CI->getNumArgOperands()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + default: + // Allowed as an predicate in a non-ALU intrinsic. + if (CI->getCalledFunction()->doesNotAccessMemory()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + case GenXIntrinsic::not_any_intrinsic: { + // Use in subroutine call. Add the corresponding function arg. + Function *CalledFunc = CI->getCalledFunction(); + assert(CalledFunc); + auto ai = CalledFunc->arg_begin(); + for (unsigned Count = ui->getOperandNo(); Count; --Count, ++ai) + ; + Argument *Arg = &*ai; + ConnectedVals->push_back(SimpleValue(Arg, Val.getIndex())); + // Connected to some return value from the call. There is a problem + // here in that it might find another predicate return value that is + // nothing to do with SIMD CF, and thus stop SIMD CF being optimized. + // But passing a predicate in and out of a function is rare outside + // of SIMD CF, so we do not worry about that. + unsigned RetIdx = 0; + auto ValTy = IndexFlattener::getElementType( + Val.getValue()->getType(), Val.getIndex()); + if (auto ST = dyn_cast(CI->getType())) { + for (unsigned End = IndexFlattener::getNumElements(ST); ; ++RetIdx) { + if (RetIdx == End) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); // no predicate ret value found + if (IndexFlattener::getElementType(ST, RetIdx) == ValTy) { + ConnectedVals->push_back(SimpleValue(CI, RetIdx)); + break; + } + } + } else if (CI->getType() == ValTy) + ConnectedVals->push_back(SimpleValue(CI, 0)); + else if (!CI->getType()->isVoidTy()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); // no predicate ret value found + break; + } + } + continue; + } + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + } + + if (LowerBadUsers) { + SetVector ToRemove; + for (auto BadUser : UsersToLower) { + replaceUseWithLoweredEM(dyn_cast(BadUser.getValue()), + BadUser.getIndex(), ToRemove); + } + for (auto Inst : ToRemove) { + removeFromEMRMVals(Inst); + } + } else { + if (!UsersToLower.empty()) + return false; + } + + return true; +} + +// check if this is an EM value or part of an EM value. 
+static bool isEM(Value *V) { + if (auto SI = dyn_cast(V)) + return isEM(SI->getOperand(0)) || isEM(SI->getOperand(1)); + return GotoJoin::isEMValue(V); +} + +// canonicalizeEM : canonicalize EM uses so that EM uses will not +// stop SIMD-CF conformance. +void GenXSimdCFConformance::canonicalizeEM() { + using namespace PatternMatch; + std::vector DeadInstructions; + + for (auto &F : M->getFunctionList()) + for (auto &BB : F.getBasicBlockList()) { + for (Instruction *Inst = BB.getTerminator(); Inst;) { + // select(C0&C1, a, b) -> select(C0, select(C1, a, b), b) + // select(C0|C1, a, b) -> select(C0, a, select(C1, a, b)) + Value *C0, *C1, *A, *B; + if (match(Inst, m_Select(m_BinOp(m_Value(C0), m_Value(C1)), m_Value(A), + m_Value(B)))) { + bool C1IsEM = isEM(C1); + if (C1IsEM || isEM(C0)) { + Value *Cond = Inst->getOperand(0); + if (Cond->getType()->isVectorTy()) { + BinaryOperator *BO = cast(Cond); + // Set Inst as insert point in order to save dominance + IRBuilder<> Builder(Inst); + if (C1IsEM) + std::swap(C0, C1); + if (BO->getOpcode() == BinaryOperator::And) { + Value *V = Builder.CreateSelect(C1, A, B); + V = Builder.CreateSelect(C0, V, B); + Inst->replaceAllUsesWith(V); + DeadInstructions.push_back(Inst); + } else if (BO->getOpcode() == BinaryOperator::Or) { + Value *V = Builder.CreateSelect(C1, A, B); + V = Builder.CreateSelect(C0, A, V); + Inst->replaceAllUsesWith(V); + DeadInstructions.push_back(Inst); + } + } + } + } + + Inst = (Inst == &BB.front()) ? nullptr : Inst->getPrevNode(); + } + } + + for (Instruction *I : DeadInstructions) + RecursivelyDeleteTriviallyDeadInstructions(I); + + // Collect data for gotos/joins EVs + handleEVs(); + // Resolve bitcast chains so they don't break conformance + resolveBitCastChains(); +} + +/*********************************************************************** + * handleEVs : collect goto/join EVs and perform some transformations + * on them. + * + * All transformations are done in GotoJoinEVs constructor. + */ +void GenXSimdCFConformance::handleEVs() +{ + // Collect gotos/joins + gatherGotoJoinEMVals(false); + for (auto val : EMVals) { + Value *GotoJoin = val.getValue(); + assert(GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join); + GotoJoinEVsMap[GotoJoin] = GotoJoinEVs(GotoJoin); + } + EMVals.clear(); +} + +/*********************************************************************** + * eliminateBitCastPreds : perform bitcast elimination on EM DF + * + * GetEMPred should be called earlier to check if Val is actually + * a EM producer. + */ +Value *GenXSimdCFConformance::eliminateBitCastPreds(Value *Val, std::set &DeadInst, std::set &Visited) +{ + Type *EMType = VectorType::get(Type::getInt1Ty(M->getContext()), 32); + + if (Visited.count(Val)) + { + return EMProducers[Val]; + } + + Visited.insert(Val); + + if (auto BCI = dyn_cast(Val)) { + assert(EMProducers[BCI] == BCI->getOperand(0) && "Bad EM producer was saved!"); + + DeadInst.insert(BCI); + return eliminateBitCastPreds(BCI->getOperand(0), DeadInst, Visited); + } else if (auto PN = dyn_cast(Val)) { + assert(EMProducers[PN] == PN && "Bad EM producer was saved!"); + + PHINode *NewPN = nullptr; + if (PN->getType() != EMType) { + // Different type at phi. This may happen if its incoming value + // became bitcast. 
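+      // e.g. (illustrative) a phi of type i32 whose incoming is
+      // 'bitcast <32 x i1> %em to i32': a replacement phi of the
+      // canonical <32 x i1> EM type is created and the bitcast is
+      // bypassed below.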
+ LLVM_DEBUG(dbgs() << "Creating new PHI for:\n" << *PN << "\n"); + NewPN = PHINode::Create(EMType, PN->getNumIncomingValues(), "EMTerm", PN); + EMProducers[NewPN] = NewPN; + // In case of cycle, we will return newly created phi + EMProducers[PN] = NewPN; + // Phi can become redundant after it + DeadInst.insert(PN); + } + + for (unsigned oi = 0, on = PN->getNumIncomingValues(); oi < on; ++oi) { + auto EMProd = eliminateBitCastPreds(PN->getIncomingValue(oi), DeadInst, Visited); + if (!NewPN) { + PN->setIncomingValue(oi, EMProd); + PN->setIncomingBlock(oi, PN->getIncomingBlock(oi)); + } else { + NewPN->addIncoming(EMProd, PN->getIncomingBlock(oi)); + } + } + + return NewPN ? NewPN : PN; + } else if (auto C = dyn_cast(Val)) { + assert(C->isAllOnesValue() && "Should be checked before!"); + assert(EMProducers[C] == C && "Bad EM producer was saved!"); + + return Constant::getAllOnesValue(EMType); + } else { + assert(Val && EMProducers[Val] == Val && "Bad EM producer was saved!"); + assert(Val->getType() == EMType && "Unexpected final EM producer!"); + + return Val; + } +} + +/*********************************************************************** + * resolveBitCastChains : resolve EM -> (bitcast) -> EM chains + * + * Standard LLVM passes create such chains sometimes and it makes + * SIMD CF non-conformant. Here we check this and make changes to + * resolve it if possible. If it is not, SIMD CF remains non-conformant + * and is lowered later. + */ +void GenXSimdCFConformance::resolveBitCastChains() +{ + LLVM_DEBUG(dbgs() << "Resolving Bitcast chains:\n"); + + // We don't have EM values here so we have to gather them + // here, too. This is because we can change EM values set + // during these transformations. + gatherEMVals(); + + std::set DeadInst; + for (auto Val : EMVals) { + if (auto PN = dyn_cast(Val.getValue())) { + LLVM_DEBUG(dbgs() << "Found phi:\n" << *PN << "\n"); + } else if (auto BCI = dyn_cast(Val.getValue())) { + LLVM_DEBUG(dbgs() << "Found bitcast:\n" << *BCI << "\n"); + } else + continue; + + std::set Visited; + Instruction *I = dyn_cast(Val.getValue()); + Value *EMProd = getEMProducer(I, Visited, true); + + if (!EMProd) { + LLVM_DEBUG(dbgs() << "!!! Not EM producer was detected when resolving bitcast chains !!!\n"); + continue; + } + + Visited.clear(); + Value *NewEMProd = eliminateBitCastPreds(EMProd, DeadInst, Visited); + if (NewEMProd != EMProd) { + EMProd->replaceAllUsesWith(NewEMProd); + } + } + + EMVals.clear(); + + for (auto DI : DeadInst) { + if (auto I = dyn_cast(DI)) + RecursivelyDeleteTriviallyDeadInstructions(I); + } + + // TODO: since we are using EMProducers only here and during get_em check, + // clean it after these transformation sinse it may contain dead data. + EMProducers.clear(); + + LLVM_DEBUG(dbgs() << "Done resolving bitcast chains:\n"); +} + +/*********************************************************************** + * checkEMInterference : check for EM values interfering with each other, + * lowering gotos/joins as necessary + * + * There is only one EM in the hardware, and we need to model that by ensuring + * that our multiple EM values, including phi nodes, do not interfere with each + * other. This is effectively a register allocator with only one register. + */ +void GenXSimdCFConformance::checkEMInterference() +{ + // Do an interference check, returning a list of defs that appear in the live + // range of other values. 
+ SetVector BadDefs; + checkInterference(&EMVals, &BadDefs, nullptr); + for (auto i = BadDefs.begin(), e = BadDefs.end(); i != e; ++i) + removeBadEMVal(*i); +} + +/*********************************************************************** + * replaceUseWithLoweredEM : lower incoming EM for user. + * + * EM is being lowered via genx_simdcf_get_em intrinsic. + */ +void GenXSimdCFConformance::replaceUseWithLoweredEM(Instruction *Val, unsigned operandNo, SetVector &ToRemove) +{ + Value *EM = Val->getOperand(operandNo); + + LLVM_DEBUG(dbgs() << "Replacing EM use:\n" << *EM << "\nwith lowered EM for:\n" << *Val << "\n"); + + if (auto EVI = dyn_cast(EM)) { + CallInst *GotoJoin = dyn_cast(EVI->getOperand(0)); + assert(GotoJoin && (GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join)); + Type *Tys[] = { EVI->getType() }; + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, Tys); + // The CFG was corrected for SIMD CF by earlier transformations + // so isBranchingGotoJoinBlock works correctly here. + if (GotoJoin::isBranchingGotoJoinBlock(GotoJoin->getParent()) == GotoJoin) { + // For branching case, we need to create false and true value + BasicBlock *DefBB = GotoJoin->getParent(); + BasicBlock *TrueBlock = DefBB->getTerminator()->getSuccessor(0); + BasicBlock *FalseBlock = DefBB->getTerminator()->getSuccessor(1); + + Value *TrueVal = Constant::getNullValue(EVI->getType()); + Value *FalseVal = CallInst::Create(GetEMDecl, { EVI }, "getEM", FalseBlock->getFirstNonPHI()); + + LLVM_DEBUG(dbgs() << "Built GetEM for Branching goto/join:\n" << *FalseVal << "\n"); + + std::map foundVals; + BasicBlockEdge TrueEdge(DefBB, TrueBlock); + BasicBlockEdge FalseEdge(DefBB, FalseBlock); + auto newPred = findGotoJoinVal(RegCategory::EM, Val->getParent(), EVI, + TrueEdge, FalseEdge, TrueVal, FalseVal, foundVals); + Val->setOperand(operandNo, newPred); + } else { + // Non-branching case: must be join. Insert get_em right after join's EM + assert(GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join && + "Gotos should be turned into branching earlier!"); + auto GetEM = CallInst::Create(GetEMDecl, { EVI }, "getEM", EVI->getParent()); + LLVM_DEBUG(dbgs() << "Built GetEM for simple join:\n" << *GetEM << "\n"); + GetEM->moveAfter(EVI); + Val->setOperand(operandNo, GetEM); + } + } else if (auto SVI = dyn_cast(EM)) { + // Shuffle vector: got through it and lower its pred + replaceUseWithLoweredEM(SVI, 0, ToRemove); + } else if (auto PN = dyn_cast(EM)) { + // The saddest case: for phi we need to lower all its preds + auto newPN = PN->clone(); + newPN->insertAfter(PN); + for (unsigned idx = 0, op_no = newPN->getNumOperands(); idx < op_no; ++idx) { + replaceUseWithLoweredEM(newPN, idx, ToRemove); + } + + Val->setOperand(operandNo, newPN); + } else if (auto Arg = dyn_cast(EM)) { + // Create get_em at function enter. This may happen if argument's user + // is moved under SIMD CF due to some reason. 
+ Type *Tys[] = { Arg->getType() }; + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, Tys); + auto GetEM = CallInst::Create(GetEMDecl, { Arg }, "getEM", Arg->getParent()->front().getFirstNonPHI()); + Val->setOperand(operandNo, GetEM); + } else + // All other instructions should not be EM producers with correct DF + assert("Failed to lower EM!"); + + ToRemove.insert(Val); +} + +/*********************************************************************** + * canUseLoweredEM : check whether instruction can use lowered EM + * + * Lowered EM is an explicit value that can be consumed by any + * instruction except of goto and join because they take implicit EM. + */ +bool GenXSimdCFConformance::canUseLoweredEM(Instruction *Val) +{ + if (GenXIntrinsic::getGenXIntrinsicID(Val) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(Val) == GenXIntrinsic::genx_simdcf_join) + return false; + + // For phi, check that it does not deal with goto or join. + if (auto PN = dyn_cast(Val)) { + for (unsigned idx = 0, opNo = PN->getNumIncomingValues(); idx < opNo; ++idx) { + auto Inst = dyn_cast(PN->getOperand(idx)); + if (Inst) { + auto Pred = Inst->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Pred) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(Pred) == GenXIntrinsic::genx_simdcf_join) + return false; + } + } + } + + return true; +} + +/*********************************************************************** + * checkInterference : check for a list of values interfering with each other + * + * Enter: Vals = values to check (not constants) + * BadDefs = SetVector in which to store any def that is found in the + * live range of another def + * ConstStop = instruction to treat as the def point of a constantpred, + * nullptr to treat the start of the function as the def + * point + * + * This code finds interference by scanning back from uses, finding other defs, + * relying on the dominance property of SSA. Having found that two EM values A + * and B interfere due to the def of A appearing in the live range of B, we + * could choose either one to lower its goto and join. In fact we choose A (the + * found def), as that tends to lower inner SIMD CF, giving a chance for the + * outer SIMD CF to become legal. + * + * Because GenXSimdCFConformance runs before live ranges are determined, so + * that it can modify code as it wants, we cannot use the normal interference + * testing code in GenXLiveness. + * + * The idea of ConstStop is different depending on whether we are testing + * interference of all EM values, or all RM values for a particular join: + * + * * For interference between all EM values, any constant (input to + * constantpred intrinsic) must be all ones, which is checked elsewhere. It + * represents the state of the execution mask at the start of the function, + * therefore we need to pretend that the constantpred's live range extends + * back to the start of the function. This is done by the caller setting + * ConstStop to 0. + * + * * For interference between all RM values for one particular join, any + * constant must be all zeros, which is checked elsewhere. It represents the + * state of that join's resume mask on entry to the function, and just after + * executing the join. Therefore we need to pretend that the constantpred's + * live range extends back to those two places. This is done by the caller + * setting ConstStop to the join instruction. 
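+ *
+ * Sketch of the scan (illustrative): for each value V in Vals, start at
+ * every use of V and walk backwards over instructions (and into
+ * predecessor blocks) until V's own def (or ConstStop, for a
+ * constantpred) is reached; any other value from Vals whose def is met
+ * on the way is recorded in BadDefs.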
+ */ +void GenXSimdCFConformance::checkInterference(SetVector *Vals, + SetVector *BadDefs, Instruction *ConstStop) +{ + // Scan the live range of each value, looking for a def of another value. + // Finding such a def indicates interference. + SetVector ToRemove; + for (auto evi = Vals->begin(), eve = Vals->end(); evi != eve; ++evi) { + Value *EMVal = evi->getValue(); + bool IsConstantPred = GenXIntrinsic::getGenXIntrinsicID(EMVal) == GenXIntrinsic::genx_constantpred; + // Set of blocks where we know the value is live out. + SmallSet LiveOut; + // Start from each use and scan backwards. + for (auto ui = EMVal->use_begin(), ue = EMVal->use_end(); ui != ue;) { + auto User = cast(ui->getUser()); + auto OpNo = ui->getOperandNo(); + ++ui; + if (auto EVI = dyn_cast(User)) { + // Ignore a use that is an extractvalue not involving the right struct + // index. + unsigned StartIndex = IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), EVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements(EVI->getType()); + if (evi->getIndex() - StartIndex >= NumIndices) + continue; + } + BasicBlock *PhiPred = nullptr; + if (auto Phi = dyn_cast(User)) + PhiPred = Phi->getIncomingBlock(OpNo); + auto Inst = User; + SmallVector PendingBBStack; + for (;;) { + if (!Inst) { + // Go on to the next pending predecessor. + if (PendingBBStack.empty()) + break; + Inst = PendingBBStack.back()->getTerminator(); + PendingBBStack.pop_back(); + } + if (&Inst->getParent()->front() == Inst) { + // Reached the start of the block. Make all unprocessed predecessors + // pending. Except if the use is in a phi node and this is the first + // time we reach the start of a block: in that case, mark only the + // corresponding phi block is pending. + if (PhiPred) { + if (LiveOut.insert(PhiPred).second) + PendingBBStack.push_back(PhiPred); + PhiPred = nullptr; + } else { + for (auto bui = Inst->getParent()->use_begin(), + bue = Inst->getParent()->use_end(); bui != bue; ++bui) { + auto Pred = cast(bui->getUser())->getParent(); + if (LiveOut.insert(Pred).second) + PendingBBStack.push_back(Pred); + } + } + Inst = nullptr; + continue; + } + // Go back to the previous instruction. (This happens even when + // starting at the end of a new block, thus skipping scanning the uses + // of the terminator, but that's OK because the terminator never uses + // our EM or RM values.) + Inst = Inst->getPrevNode(); + if (Inst == EMVal && !IsConstantPred) { + // Reached the def of the value. Stop scanning, unless the def is + // constantpred, in which case we pretend it was live from the + // ConstStop. + Inst = nullptr; + continue; + } + if (Inst == ConstStop && IsConstantPred) { + // For a constantpred value, we have reached the point that we want + // to treat as its definition point. Stop scanning. + Inst = nullptr; + continue; + } + // Check if this is the def of some other EM value. + if (auto VT = dyn_cast(Inst->getType())) + if (VT->getElementType()->isIntegerTy(1)) + if (Vals->count(Inst) && !ToRemove.count(Inst)) { + // It is the def of some other EM value. Mark that one as + // interfering. However do not mark it if both values are + // constantpred, since we pretend all of those are defined at the + // start of the function. 
+              if (!IsConstantPred
+                  || GenXIntrinsic::getGenXIntrinsicID(Inst) != GenXIntrinsic::genx_constantpred) {
+                LLVM_DEBUG(dbgs() << "GenXSimdCFConformance::checkInterference: def of "
+                                  << Inst->getName() << " found in live range of "
+                                  << EMVal->getName() << "\n");
+                auto SVI = dyn_cast<ShuffleVectorInst>(Inst);
+                if (SVI && SVI->getOperand(0) == EMVal) {
+                  // The shuffle vector is baled in as an EM of another size:
+                  // this check ensures that the EM in SVI is still valid.
+                  LLVM_DEBUG(dbgs() << "\tShuffle vector with correct arg, skipping it\n");
+                } else if (canUseLoweredEM(User) && !FG) {
+                  // Lower EM in the early pass.
+                  replaceUseWithLoweredEM(User, OpNo, ToRemove);
+                  LLVM_DEBUG(dbgs() << "\tSucceeded to lower EM for that use\n");
+                } else {
+                  LLVM_DEBUG(dbgs() << "\t!!! Failed to lower EM for that use: def will be lowered\n");
+                  BadDefs->insert(Inst);
+                }
+                // Done for that use.
+                break;
+              }
+            }
+      }
+    }
+  }
+
+  for (auto Inst : ToRemove) {
+    removeFromEMRMVals(Inst);
+  }
+}
+
+/***********************************************************************
+ * insertCond : insert a vector-of-i1 value into the start of another one
+ *
+ * Enter:   OldVal = value to insert into
+ *          NewVal = value to insert, at index 0
+ *          Name = name for any new instruction
+ *          InsertBefore = where to insert any new instruction
+ *          DL = debug loc to give any new instruction
+ *
+ * Return:  value, possibly the same as the input value
+ */
+Value *GenXSimdCFConformance::insertCond(Value *OldVal, Value *NewVal,
+    const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL)
+{
+  unsigned OldWidth = OldVal->getType()->getVectorNumElements();
+  unsigned NewWidth = NewVal->getType()->getVectorNumElements();
+  if (OldWidth == NewWidth)
+    return NewVal;
+  // Do the insert with shufflevector. We need two shufflevectors, one to extend
+  // NewVal to OldVal's width, and one to combine them.
+  // GenXLowering decides whether this is suitable to lower to wrpredregion, or
+  // needs to be lowered to something less efficient.
+  SmallVector<Constant *, 8> Indices;
+  Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext());
+  unsigned i;
+  for (i = 0; i != NewWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i));
+  auto UndefIndex = UndefValue::get(I32Ty);
+  for (; i != OldWidth; ++i)
+    Indices.push_back(UndefIndex);
+  auto SV1 = new ShuffleVectorInst(NewVal, UndefValue::get(NewVal->getType()),
+      ConstantVector::get(Indices), NewVal->getName() + ".extend", InsertBefore);
+  SV1->setDebugLoc(DL);
+  if (isa<UndefValue>(OldVal))
+    return SV1;
+  Indices.clear();
+  for (i = 0; i != NewWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i + OldWidth));
+  for (; i != OldWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i));
+  // SV2 takes lanes 0..NewWidth-1 from SV1 (the extended NewVal) and the
+  // remaining lanes from OldVal.
+  auto SV2 = new ShuffleVectorInst(OldVal, SV1, ConstantVector::get(Indices),
+      Name, InsertBefore);
+  SV2->setDebugLoc(DL);
+  return SV2;
+}
+
+/***********************************************************************
+ * truncateCond : truncate a vector-of-i1 value
+ *
+ * Enter:   In = input value
+ *          Ty = type to truncate to
+ *          Name = name for any new instruction
+ *          InsertBefore = where to insert any new instruction
+ *          DL = debug loc to give any new instruction
+ *
+ * Return:  value, possibly the same as the input value
+ */
+Value *GenXSimdCFConformance::truncateCond(Value *In, Type *Ty,
+    const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL)
+{
+  unsigned InWidth = In->getType()->getVectorNumElements();
+  unsigned TruncWidth = Ty->getVectorNumElements();
+  if (InWidth == TruncWidth)
+    return In;
+  // Do the truncate with shufflevector.
GenXLowering lowers it to rdpredregion. + SmallVector Indices; + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + unsigned i; + for (i = 0; i != TruncWidth; ++i) + Indices.push_back(ConstantInt::get(I32Ty, i)); + auto SV = new ShuffleVectorInst(In, UndefValue::get(In->getType()), + ConstantVector::get(Indices), Name, InsertBefore); + SV->setDebugLoc(DL); + return SV; +} + +/*********************************************************************** + * lowerGoto : lower a llvm.genx.simdcf.goto + * + * This also outputs a warning that we failed to optimize a SIMD branch. + * We always output it, rather than including it in the -rpass mechanism + * to enable or disable the warning, as it is an unexpected situation that + * we want our users to report. + */ +void GenXSimdCFConformance::lowerGoto(CallInst *Goto) +{ + LLVM_DEBUG(dbgs() << "lowerGoto: " << *Goto << "\n"); + const DebugLoc &DL = Goto->getDebugLoc(); + if (EnableGenXGotoJoin && !lowerSimdCF) + DiagnosticInfoSimdCF::emit(Goto, "failed to optimize SIMD branch", DS_Warning); + Value *Results[3]; + auto EM = Goto->getOperand(0); + auto Cond = Goto->getOperand(2); + // EM is always 32 bit. Extract SubEM, of the same width as Cond, from it. + auto OldSubEM = truncateCond(EM, Cond->getType(), + EM->getName() + ".sub", Goto, DL); + // Result 1: NewRM = OldRM | (SubEM & ~Cond) + auto NotCond = BinaryOperator::Create(Instruction::Xor, Cond, + Constant::getAllOnesValue(Cond->getType()), + Goto->getName() + ".notcond", Goto); + NotCond->setDebugLoc(DL); + auto NotCondAndSubEM = BinaryOperator::Create(Instruction::And, NotCond, + OldSubEM, Goto->getName() + ".disabling", Goto); + NotCondAndSubEM->setDebugLoc(DL); + Value *OldRM = Goto->getArgOperand(1); + auto NewRM = BinaryOperator::Create(Instruction::Or, OldRM, NotCondAndSubEM, + Goto->getName() + ".newRM", Goto); + NewRM->setDebugLoc(DL); + Results[1] = NewRM; + // And SubEM with Cond. + auto SubEM = BinaryOperator::Create(Instruction::And, OldSubEM, Cond, + Goto->getName() + ".subEM", Goto); + SubEM->setDebugLoc(DL); + // Insert that back into EM. That is result 0. + Results[0] = EM = insertCond(EM, SubEM, Goto->getName() + ".EM", Goto, DL); + // Result 2: BranchCond = !any(SubEM) + Function *AnyFunc = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_any, + SubEM->getType()); + auto Any = CallInst::Create(AnyFunc, SubEM, + SubEM->getName() + ".any", Goto); + Any->setDebugLoc(DL); + auto Not = BinaryOperator::Create(Instruction::Xor, Any, + Constant::getAllOnesValue(Any->getType()), + Any->getName() + ".not", Goto); + Not->setDebugLoc(DL); + Results[2] = Not; + // Replace uses. + replaceGotoJoinUses(Goto, Results); + Goto->eraseFromParent(); + Modified = true; +} + +/*********************************************************************** + * lowerJoin : lower a llvm.genx.simdcf.join + */ +void GenXSimdCFConformance::lowerJoin(CallInst *Join) +{ + LLVM_DEBUG(dbgs() << "lowerJoin: " << *Join << "\n"); + const DebugLoc &DL = Join->getDebugLoc(); + Value *Results[2]; + auto EM = Join->getOperand(0); + auto RM = Join->getOperand(1); + // EM is always 32 bit. Extract SubEM, of the same width as RM, from it. + auto OldSubEM = truncateCond(EM, RM->getType(), EM->getName() + ".sub", + Join, DL); + // Or it with RM. + auto SubEM = BinaryOperator::Create(Instruction::Or, OldSubEM, RM, + Join->getName() + ".subEM", Join); + SubEM->setDebugLoc(DL); + // Insert that back into EM. That is result 0. 
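+  // As a scalar model of the mask algebra in lowerGoto/lowerJoin (a sketch
+  // only: each mask is modelled as one bit per lane in a plain integer, and
+  // the names are illustrative rather than taken from this pass):
+  //
+  //   void gotoModel(uint32_t &EM, uint32_t &RM, uint32_t Cond, bool &Branch) {
+  //     RM |= EM & ~Cond;    // NewRM = OldRM | (SubEM & ~Cond)
+  //     EM &= Cond;          // NewEM = SubEM & Cond
+  //     Branch = (EM == 0);  // BranchCond = !any(NewEM)
+  //   }
+  //   void joinModel(uint32_t &EM, uint32_t RM, bool &Branch) {
+  //     EM |= RM;            // NewEM = SubEM | RM
+  //     Branch = (EM == 0);  // BranchCond = !any(NewEM)
+  //   }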
+ Results[0] = EM = insertCond(EM, SubEM, Join->getName() + ".EM", Join, DL); + // Result 1: BranchCond = !any(SubEM) + Function *AnyFunc = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_any, + SubEM->getType()); + auto Any = CallInst::Create(AnyFunc, SubEM, + SubEM->getName() + ".any", Join); + Any->setDebugLoc(DL); + auto Not = BinaryOperator::Create(Instruction::Xor, Any, + Constant::getAllOnesValue(Any->getType()), + Any->getName() + ".not", Join); + Not->setDebugLoc(DL); + Results[1] = Not; + // Replace uses. + replaceGotoJoinUses(Join, Results); + Join->eraseFromParent(); + Modified = true; +} + +/*********************************************************************** + * replaceGotoJoinUses : replace uses of goto/join + * + * The goto and join intrinsics have multiple return values in a struct. + * This attempts to find the extractvalues and replace those directly. + * It also spots where a value is unused. + */ +void GenXSimdCFConformance::replaceGotoJoinUses(CallInst *GotoJoin, + ArrayRef Vals) +{ + SmallVector Extracts; + for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); + ui != ue; ++ui) { + auto Extract = dyn_cast(ui->getUser()); + if (Extract) + Extracts.push_back(Extract); + } + for (auto ei = Extracts.begin(), ee = Extracts.end(); ei != ee; ++ei) { + auto Extract = *ei; + unsigned Index = Extract->getIndices()[0]; + if (Index >= Vals.size()) + continue; + Extract->replaceAllUsesWith(Vals[Index]); + Extract->eraseFromParent(); + } + if (!GotoJoin->use_empty()) { + // There are still some uses of the original goto/join. We need to + // aggregate the result values into a struct. + Value *StructVal = UndefValue::get(GotoJoin->getType()); + Instruction *InsertBefore = GotoJoin->getNextNode(); + for (unsigned Index = 0, + End = cast(GotoJoin->getType())->getNumElements(); + Index != End; ++Index) + StructVal = InsertValueInst::Create(StructVal, Vals[Index], + Index, "", InsertBefore); + GotoJoin->replaceAllUsesWith(StructVal); + } else { + // Remove code for unused value. This is particularly useful at an outer + // join, where the !any(NewEM) is unused, so we don't need to compute it. + for (unsigned vi = 0; vi != Vals.size(); ++vi) { + Value *V = Vals[vi]; + while (V && V->use_empty()) { + auto I = dyn_cast(V); + if (I == nullptr) + continue; + unsigned NumOperands = I->getNumOperands(); + if (auto CI = dyn_cast(I)) + NumOperands = CI->getNumArgOperands(); + V = nullptr; + if (NumOperands == 1) + V = I->getOperand(0); + I->eraseFromParent(); + } + } + } +} + +/*********************************************************************** + * setCategories : set webs of EM and RM values to category EM or RM + * + * This also modifies EM uses as needed. + */ +void GenXLateSimdCFConformance::setCategories() +{ + // First the EM values. + for (auto ei = EMVals.begin(); ei != EMVals.end(); /* empty */) { + SimpleValue EMVal = *ei; + ei++; + // For this EM value, set its category and modify its uses. + Liveness->getOrCreateLiveRange(EMVal)->setCategory(RegCategory::EM); + LLVM_DEBUG(dbgs() << "Set category for:\n" << *EMVal.getValue() << "\n"); + if (!isa(EMVal.getValue()->getType())) + modifyEMUses(EMVal.getValue()); + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_join: { + // For a join, set the category of each RM value. 
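+      // (EM and RM webs get their own register categories so that the later
+      // passes -- coalescing, liveness and vISA register allocation -- keep
+      // them in the hardware execution-mask/resume-mask state rather than
+      // treating them as ordinary general values.)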
+ auto RMValsEntry = &RMVals[cast(EMVal.getValue())]; + for (auto vi = RMValsEntry->begin(), ve = RMValsEntry->end(); vi != ve; ++vi) { + SimpleValue RMVal = *vi; + // For this RM value, set its category. + Liveness->getOrCreateLiveRange(RMVal)->setCategory(RegCategory::RM); + } + } + // Fall through... + case GenXIntrinsic::genx_simdcf_goto: { + // See if this is a branching goto/join where the "true" successor is + // an empty critical edge splitter block. + auto CI = cast(EMVal.getValue()); + BasicBlock *BB = CI->getParent(); + if (GotoJoin::isBranchingGotoJoinBlock(BB) == CI) { + BasicBlock *TrueSucc = BB->getTerminator()->getSuccessor(0); + if (BasicBlock *TrueSuccSucc + = getEmptyCriticalEdgeSplitterSuccessor(TrueSucc)) { + for (auto i = TrueSucc->begin(); i != TrueSucc->end(); /*empty*/) { + Instruction *Inst = &*i++; + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + if (Phi->getNumIncomingValues() == 1) { + Phi->replaceAllUsesWith(Phi->getIncomingValue(0)); + Liveness->eraseLiveRange(Phi); + removeFromEMRMVals(Phi); + Phi->eraseFromParent(); + } + } + // now BB should be truely empty + assert(TrueSucc->front().isTerminator() && + "BB is not empty for removal"); + // For a branching goto/join where the "true" successor is an empty + // critical edge splitter block, remove the empty block, to ensure + // that the "true" successor is a join label. + // Adjust phi nodes in TrueSuccSucc. + adjustPhiNodesForBlockRemoval(TrueSuccSucc, TrueSucc); + // Replace the use (we know there is only the one). + BB->getTerminator()->setSuccessor(0, TrueSuccSucc); + // Erase the critical edge splitter block. + TrueSucc->eraseFromParent(); + Modified = true; + } + } + break; + } + default: + break; + } + } +} + +/*********************************************************************** + * modifyEMUses : modify EM uses as needed + */ +void GenXLateSimdCFConformance::modifyEMUses(Value *EM) +{ + LLVM_DEBUG(dbgs() << "modifyEMUses: " << EM->getName() << "\n"); + // Gather the selects we need to modify, at the same time as handling other + // uses of the EM values. + SmallVector Selects; + SmallVector EMs; + EMs.push_back(EM); + for (unsigned ei = 0; ei != EMs.size(); ++ei) { + EM = EMs[ei]; + // Scan EM's uses. + for (auto ui = EM->use_begin(), ue = EM->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Sel = dyn_cast(User)) { + assert(!ui->getOperandNo()); + Selects.push_back(Sel); + } else switch (GenXIntrinsic::getAnyIntrinsicID(User)) { + case GenXIntrinsic::genx_rdpredregion: + // An rdpredregion of the EM. Find its uses in select too. + EMs.push_back(User); + break; +#ifndef NDEBUG + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + case GenXIntrinsic::genx_simdcf_get_em: + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + assert(ui->getOperandNo() == GenXIntrinsic::GenXRegion::PredicateOperandNum); + break; + case GenXIntrinsic::genx_wrpredpredregion: + break; + default: + if (isa(User) || isa(User) + || isa(User)) + break; + assert(!cast(User)->getCalledFunction()->doesNotAccessMemory() + && "unexpected ALU intrinsic use of EM"); + break; + case GenXIntrinsic::not_any_intrinsic: + assert((isa(User) || isa(User) || + isa(User) || isa(User) || + isa(User)) && + "unexpected use of EM"); +#endif + } + } + } + // Modify each select into a predicated wrregion. 
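+  // Roughly, for each gathered select the loop below turns
+  //     %res = select <N x i1> %EM, <N x T> %true, <N x T> %false
+  // into a predicated wrregion "%false with the %EM-enabled lanes overwritten
+  // by %true", so the EM use becomes a legal predicate operand (the vector
+  // width N and the value names here are illustrative).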
+ for (auto si = Selects.begin(), se = Selects.end(); si != se; ++si) { + auto Sel = *si; + Value *FalseVal = Sel->getFalseValue(); + if (auto C = dyn_cast(FalseVal)) { + if (!isa(C)) { + // The false value needs loading if it is a constant other than + // undef. + SmallVector AddedInstructions; + FalseVal = ConstantLoader(C, nullptr, &AddedInstructions).loadBig(Sel); + // ConstantLoader generated at least one instruction. Ensure that + // each one has debug loc and category. + for (auto aii = AddedInstructions.begin(), aie = AddedInstructions.end(); + aii != aie; ++aii) { + Instruction *I = *aii; + I->setDebugLoc(Sel->getDebugLoc()); + } + } + } + Region R(Sel); + R.Mask = Sel->getCondition(); + assert(FalseVal); + Value *Wr = R.createWrRegion(FalseVal, Sel->getTrueValue(), + Sel->getName(), Sel, Sel->getDebugLoc()); + Sel->replaceAllUsesWith(Wr); + Liveness->eraseLiveRange(Sel); + Sel->eraseFromParent(); + Modified = true; + } +} + +/*********************************************************************** + * GotoJoinEVs::GotoJoinEVs : collects and handle EVs. See CollectEVs + * for more info. + */ +GenXSimdCFConformance::GotoJoinEVs::GotoJoinEVs(Value* GJ) { + GotoJoin = GJ; + + if (!GotoJoin) + return; + + switch (GenXIntrinsic::getGenXIntrinsicID(GotoJoin)) { + case GenXIntrinsic::genx_simdcf_goto: + IsGoto = true; + break; + case GenXIntrinsic::genx_simdcf_join: + IsGoto = false; + break; + default: + assert(false && "Expected goto or join!"); + break; + } + + CollectEVs(); +} + +/*********************************************************************** + * GotoJoinEVs::getEMEV : get EV for goto/join Execution Mask + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getEMEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return EVs[EMPos]; +} + +/*********************************************************************** + * GotoJoinEVs::getRMEV : get EV for goto/join Resume Mask + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getRMEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + assert(IsGoto && "Only goto returns RM!"); + return EVs[RMPos]; +} + +/*********************************************************************** + * GotoJoinEVs::getCondEV : get EV for goto/join condition + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getCondEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return IsGoto ? EVs[GotoCondPos] : EVs[JoinCondPos]; +} + +Value *GenXSimdCFConformance::GotoJoinEVs::getGotoJoin() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return GotoJoin; +} + +/*********************************************************************** + * GotoJoinEVs::getSplitPoint : find first instruction that is not + * a EV or doesn't use Goto/Join. Such instruction always exists + * in a correct IR - BB terminator is a such instruction. + */ + Instruction *GenXSimdCFConformance::GotoJoinEVs::getSplitPoint() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + Instruction *SplitPoint = cast(GotoJoin)->getNextNode(); + for (; isa(SplitPoint) && SplitPoint->getOperand(0) == GotoJoin; + SplitPoint = SplitPoint->getNextNode()); + return SplitPoint; + } + +/*********************************************************************** + * GotoJoinEVs::setCondEV : set EV for goto/join condition. It is + * needed on basic block splitting to handle bad Cond EV user. 
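+ *
+ * (For reference: the EV positions used throughout GotoJoinEVs mirror the
+ * return-struct layout of the intrinsics -- goto returns {EM, RM, BranchCond},
+ * giving EMPos = 0, RMPos = 1, GotoCondPos = 2, while join returns
+ * {EM, BranchCond}, so JoinCondPos shares index 1 with RMPos.)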
+ */
+void GenXSimdCFConformance::GotoJoinEVs::setCondEV(ExtractValueInst *CondEV) {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  assert(!getCondEV() && "CondEV is already set!");
+  if (IsGoto)
+    EVs[GotoCondPos] = CondEV;
+  else
+    EVs[JoinCondPos] = CondEV;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::isGoto : check whether this EVs info belongs to a goto
+ */
+bool GenXSimdCFConformance::GotoJoinEVs::isGoto() const {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  return IsGoto;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::isJoin : check whether this EVs info belongs to a join
+ */
+bool GenXSimdCFConformance::GotoJoinEVs::isJoin() const {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  return !IsGoto;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::CollectEVs : handle and store goto/join EVs
+ *
+ * This does the following steps:
+ *  - Locate EVs. If a duplicate is found, just replace its users.
+ *  - Move EVs right after the goto/join.
+ *  - Add missing EM and RM EVs. This is needed for correct live range
+ *    interference analysis.
+ */
+void GenXSimdCFConformance::GotoJoinEVs::CollectEVs() {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  assert((GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto ||
+          GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join) &&
+         "Expected goto or join!");
+
+  auto GotoJoinInst = dyn_cast<Instruction>(GotoJoin);
+
+  // Collect EVs, hoist them, resolve duplications.
+  for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); ui != ue;) {
+
+    auto EV = dyn_cast<ExtractValueInst>(ui->getUser());
+    ++ui;
+
+    assert(EV && "Bad user of goto/join!");
+    assert(EV->getNumIndices() == 1 && "Expected 1 index in ExtractValue for goto/join!");
+
+    unsigned idx = EV->getIndices()[0];
+#ifndef NDEBUG
+    switch (idx) {
+    case EMPos:
+    case RMPos: // same as JoinCondPos
+      break;
+    case GotoCondPos:
+      if (IsGoto)
+        break;
+      // The cond index is only valid for goto: fall through to the assert.
+    default:
+      assert(false && "Bad index in ExtractValue for goto/join!");
+      break;
+    }
+#endif
+
+    LLVM_DEBUG(dbgs() << "Found EV:\n" << *EV << "\n");
+    if (EVs[idx]) {
+      LLVM_DEBUG(dbgs() << "Duplication: replacing users with:\n" << *EVs[idx] << "\n");
+      EV->replaceAllUsesWith(EVs[idx]);
+      EV->eraseFromParent();
+    } else {
+      LLVM_DEBUG(dbgs() << "Saving it.\n");
+      EVs[idx] = EV;
+    }
+  }
+
+  // Add missing EVs for masks
+  for (unsigned idx = 0, end = IsGoto ?
RMPos : EMPos; idx <= end; ++idx) { + if (EVs[idx]) + continue; + + std::string Name = "missing"; + switch (idx) { + case EMPos: + Name += "EMEV"; + break; + case RMPos: + Name += "RMEV"; + break; + case GotoCondPos: + Name += "CondEV"; + break; + } + + auto EV = ExtractValueInst::Create(GotoJoin, { idx }, Name, GotoJoinInst->getParent()); + EVs[idx] = EV; + } + + hoistEVs(); +} + +/*********************************************************************** + * GotoJoinEVs::hoistEVs : move EVs right after goto/join + */ +void GenXSimdCFConformance::GotoJoinEVs::hoistEVs() const{ + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + + LLVM_DEBUG(dbgs() << "Moving EV users after:\n" << *GotoJoin << "\n"); + + for (unsigned idx = 0, num = PosNum; idx < num; ++idx) { + if (EVs[idx]) + EVs[idx]->moveAfter(dyn_cast(GotoJoin)); + } +} + +/*********************************************************************** + * DiagnosticInfoSimdCF::emit : emit an error or warning + */ +void DiagnosticInfoSimdCF::emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity) +{ + DiagnosticInfoSimdCF Err(Severity, *Inst->getParent()->getParent(), + Inst->getDebugLoc(), Msg); + Inst->getContext().diagnose(Err); +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp new file mode 100644 index 000000000000..33454d6780d7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp @@ -0,0 +1,145 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file implements the GenX specific subclass of TargetSubtargetInfo. 
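+// The subtarget variant is picked from the CPU name handed in by the driver
+// (for example "SKL" or "TGLLP"): resetSubtargetFeatures() maps the name onto
+// a GenXTag via a StringSwitch, falling back to GENX_SKL for unknown names,
+// and then lets the tablegen-generated ParseSubtargetFeatures() apply the
+// feature string.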
+// +//===----------------------------------------------------------------------===// + +#include "GenXSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_MC_DESC +#include "GenXGenSubtargetInfo.inc" + +static cl::opt + StackScratchMem("stack-scratch-mem", + cl::desc("Specify what surface should be used for stack"), + cl::init(true)); +static cl::opt StackMemSize("stack-mem-size", + cl::desc("Available space for stack"), + cl::init(8 * 1024)); + +void GenXSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { + + DumpRegAlloc = false; + EmitCisa = false; + HasLongLong = false; + DisableJmpi = false; + DisableVectorDecomposition = false; + WarnCallable = false; + OCLRuntime = false; + + if (StackScratchMem) + StackSurf = PreDefined_Surface::PREDEFINED_SURFACE_T255; + else + StackSurf = PreDefined_Surface::PREDEFINED_SURFACE_STACK; + StackSurfMaxSize = StackMemSize; + UseGlobalMem = false; + + GenXVariant = llvm::StringSwitch(CPU) + .Case("HSW", GENX_HSW) + .Case("BDW", GENX_BDW) + .Case("CHV", GENX_CHV) + .Case("SKL", GENX_SKL) + .Case("BXT", GENX_BXT) + .Case("KBL", GENX_KBL) + .Case("GLK", GENX_GLK) + .Case("CNL", GENX_CNL) + .Case("ICLLP", GENX_ICLLP) + .Case("TGLLP", GENX_TGLLP) + .Default(GENX_SKL); + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + ParseSubtargetFeatures(CPUName, FS); +} + +GenXSubtarget::GenXSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS) + : GenXGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT) { + + resetSubtargetFeatures(CPU, FS); +} + +StringRef GenXSubtarget::getEmulateFunction(const Instruction *Inst) const { + StringRef EmuFnName; + if (emulateIDivRem()) { + unsigned Opcode = Inst->getOpcode(); + switch (Opcode) { + default: + break; + case BinaryOperator::SDiv: + EmuFnName = "__cm_intrinsic_impl_sdiv"; + break; + case BinaryOperator::SRem: + EmuFnName = "__cm_intrinsic_impl_srem"; + break; + case BinaryOperator::UDiv: + EmuFnName = "__cm_intrinsic_impl_udiv"; + break; + case BinaryOperator::URem: + EmuFnName = "__cm_intrinsic_impl_urem"; + break; + } + } + return EmuFnName; +} + +GenXSubtargetPass::GenXSubtargetPass() : ImmutablePass(ID), ST(nullptr) {} +GenXSubtargetPass::GenXSubtargetPass(GenXSubtarget &ST) + : ImmutablePass(ID), ST(&ST) {} +GenXSubtargetPass::~GenXSubtargetPass() {} + +char GenXSubtargetPass::ID = 0; + +namespace llvm { + +void initializeGenXSubtargetPassPass(PassRegistry &); + +ImmutablePass *createGenXSubtargetPass(GenXSubtarget &ST) { + initializeGenXSubtargetPassPass(*PassRegistry::getPassRegistry()); + return new GenXSubtargetPass(ST); +} + +} // namespace llvm + +INITIALIZE_PASS_BEGIN(GenXSubtargetPass, "GenXSubtargetPass", "GenXSubtargetPass", false, true) +INITIALIZE_PASS_END(GenXSubtargetPass, "GenXSubtargetPass", "GenXSubtargetPass", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h new file mode 100644 index 000000000000..2f7b158faa8d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h @@ -0,0 +1,293 @@ +/*===================== begin_copyright_notice 
================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXSubtarget : subtarget information +/// ------------------------------------- +/// +/// GenXSubtarget is the GenX-specific subclass of TargetSubtargetInfo. It takes +/// features detected by the front end (what the Gen architecture is), +/// and exposes flags to the rest of the GenX backend for +/// various features (e.g. whether 64 bit operations are supported). +/// +/// Where subtarget features are used is noted in the documentation of GenX +/// backend passes. +/// +/// The flags exposed to the rest of the GenX backend are as follows. Most of +/// these are currently not used. +/// +//===----------------------------------------------------------------------===// + +#ifndef GENXSUBTARGET_H +#define GENXSUBTARGET_H + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Pass.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "visa_igc_common_header.h" +#include + +#define GET_SUBTARGETINFO_HEADER +#define GET_SUBTARGETINFO_ENUM +#include "GenXGenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class Instruction; +class StringRef; +class TargetMachine; + +class GenXSubtarget final : public GenXGenSubtargetInfo { + +protected: + // TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + enum GenXTag { + GENX_GENERIC, + GENX_HSW, + GENX_BDW, + GENX_CHV, + GENX_SKL, + GENX_BXT, + GENX_KBL, + GENX_GLK, + GENX_CNL, + GENX_ICLLP, + GENX_TGLLP, + }; + + // GenXVariant - GenX Tag identifying the variant to compile for + GenXTag GenXVariant; + +private: + // DumpRegAlloc - True if we should dump register allocation information + bool DumpRegAlloc; + + // EmitCisa Builder - True if we should generate CISA instead of VISA + bool EmitCisa; + + // HasLongLong - True if subtarget supports long long type + bool HasLongLong; + + // DisableJmpi - True if jmpi is disabled. + bool DisableJmpi; + + // DisableVectorDecomposition - True if vector decomposition is disabled. + bool DisableVectorDecomposition; + + // Only generate warning when callable is used in the middle of the kernel + bool WarnCallable; + + // True if codegenerating for OCL runtime. 
+ bool OCLRuntime; + + // Shows which surface should we use for stack + PreDefined_Surface StackSurf; + // Limit in bytes for stack purposes + unsigned StackSurfMaxSize; + + bool UseGlobalMem; + +public: + // This constructor initializes the data members to match that + // of the specified triple. + // + GenXSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS); + + // hasLongLong - true for Gen8+ + bool hasLongLong() { return HasLongLong; } + + unsigned getGRFWidth() const { return 32; } + + bool isOCLRuntime() const { return OCLRuntime; } + + // ParseSubtargetFeatures - Parses features string setting specified + // subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + // \brief Reset the features for the GenX target. + void resetSubtargetFeatures(StringRef CPU, StringRef FS); + +public: + + /// * isHSW - true if target is HSW + bool isHSW() const { return GenXVariant == GENX_HSW; } + + /// * isBDW - true if target is BDW + bool isBDW() const { return GenXVariant == GENX_BDW; } + + /// * isBDWplus - true if target is BDW or later + bool isBDWplus() const { return GenXVariant >= GENX_BDW; } + + /// * isCHV - true if target is CHV + bool isCHV() const { return GenXVariant == GENX_CHV; } + + /// * isSKL - true if target is SKL + bool isSKL() const { return GenXVariant == GENX_SKL; } + + /// * isSKLplus - true if target is SKL or later + bool isSKLplus() const { return GenXVariant >= GENX_SKL; } + + /// * isBXT - true if target is BXT + bool isBXT() const { return GenXVariant == GENX_BXT; } + + + /// * isKBL - true if target is KBL + bool isKBL() const { return GenXVariant == GENX_KBL; } + + /// * isGLK - true if target is GLK + bool isGLK() const { return GenXVariant == GENX_GLK; } + + /// * isCNL - true if target is CNL + bool isCNL() const { return GenXVariant == GENX_CNL; } + + /// * isCNLplus - true if target is CNL or later + bool isCNLplus() const { return GenXVariant >= GENX_CNL; } + + /// * isICLLP - true if target is ICL LP + bool isICLLP() const { return GenXVariant == GENX_ICLLP; } + /// * isTGLLP - true if target is TGL LP + bool isTGLLP() const { return GenXVariant == GENX_TGLLP; } + + /// * emulateIDivRem - true if emulates integer division and reminder. + bool emulateIDivRem() const { return GenXVariant >= GENX_TGLLP; } + + /// * dumpRegAlloc - true if we should dump Reg Alloc info + bool dumpRegAlloc() const { return DumpRegAlloc; } + + /// * hasLongLong - true if target supports long long + bool hasLongLong() const { return HasLongLong; } + + /// * disableJmpi - true if jmpi is disabled. + bool disableJmpi() const { return DisableJmpi; } + + /// * WaNoA32ByteScatteredStatelessMessages - true if there is no A32 byte + /// scatter stateless message. + bool WaNoA32ByteScatteredStatelessMessages() const { return !isCNLplus(); } + + /// * disableVectorDecomposition - true if vector decomposition is disabled. 
+ bool disableVectorDecomposition() const { return DisableVectorDecomposition; } + + /// * warnCallable() - true if compiler only generate warning for + /// callable in the middle + bool warnCallable() const { return WarnCallable; } + + /// * hasIndirectGRFCrossing - true if target supports an indirect region + /// crossing one GRF boundary + bool hasIndirectGRFCrossing() const { return isSKLplus(); } + + bool useGlobalMem() const { return UseGlobalMem; } + + void setUseGlobalMem() { + assert(hasLongLong() && isOCLRuntime() && + "Global mem stack can't be used on 32-bit targets or on CMRT"); + UseGlobalMem = true; + } + + /// * getEmulateFunction - return the corresponding emulation function name, + /// empty string if no emulation is needed. + StringRef getEmulateFunction(const Instruction *Inst) const; + + // Generic helper functions... + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + + bool isTargetWindowsMSVC() const { + return TargetTriple.isWindowsMSVCEnvironment(); + } + + bool isTargetKnownWindowsMSVC() const { + return TargetTriple.isKnownWindowsMSVCEnvironment(); + } + + bool isTargetWindowsCygwin() const { + return TargetTriple.isWindowsCygwinEnvironment(); + } + + bool isTargetWindowsGNU() const { + return TargetTriple.isWindowsGNUEnvironment(); + } + + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + + bool isOSWindows() const { return TargetTriple.isOSWindows(); } + + TARGET_PLATFORM getVisaPlatform() const { + switch (GenXVariant) { + case GENX_BDW: + return TARGET_PLATFORM::GENX_BDW; + case GENX_CHV: + return TARGET_PLATFORM::GENX_CHV; + case GENX_SKL: + return TARGET_PLATFORM::GENX_SKL; + case GENX_BXT: + return TARGET_PLATFORM::GENX_BXT; + case GENX_CNL: + return TARGET_PLATFORM::GENX_CNL; + case GENX_ICLLP: + return TARGET_PLATFORM::GENX_ICLLP; + case GENX_TGLLP: + return TARGET_PLATFORM::GENX_TGLLP; + // TODO: Unfortunately, the finalizer doesn't support all platforms, so we + // map any unsupported platforms to the most appropriate supported one. + // See also getFinalizerPlatform function in GenX.cpp + case GENX_KBL: + return TARGET_PLATFORM::GENX_SKL; + case GENX_GLK: + return TARGET_PLATFORM::GENX_BXT; + default: + return TARGET_PLATFORM::GENX_NONE; + } + } + + /// * stackSurface - return a surface that should be used for stack. + PreDefined_Surface stackSurface() const { return StackSurf; } + + /// * stackSurfaceMaxSize - return available space in bytes for stack + /// purposes. 
+ unsigned stackSurfaceMaxSize() const { return StackSurfMaxSize; } +}; + +class GenXSubtargetPass : public ImmutablePass { + GenXSubtarget *ST; +public: + GenXSubtargetPass(); + GenXSubtargetPass(GenXSubtarget &ST); + ~GenXSubtargetPass(); + GenXSubtarget *getSubtarget() { return ST; } + static char ID; +}; + +ImmutablePass *createGenXSubtargetPass(GenXSubtarget &ST); + +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp new file mode 100644 index 000000000000..76cc1f1b2054 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp @@ -0,0 +1,546 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines the GenX specific subclass of TargetMachine. +// +/// Non-pass classes +/// ================ +/// +/// This section documents some GenX backend classes and abstractions that are not +/// in themselves passes, but are used by the passes. +/// +/// .. include:: GenXAlignmentInfo.h +/// +/// .. include:: GenXRegion.h +/// +/// .. include:: GenXSubtarget.h +/// +/// Pass documentation +/// ================== +/// +/// The GenX backend runs the following passes on LLVM IR: +/// +/// .. 
contents:: +/// :local: +/// :depth: 1 +/// +// +//===----------------------------------------------------------------------===// + +#include "GenXTargetMachine.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXModule.h" +#include "GenXOCLRuntimeInfo.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" + +using namespace llvm; + +static cl::opt DumpRegAlloc("genx-dump-regalloc", cl::init(false), cl::Hidden, + cl::desc("Enable dumping of GenX liveness and register allocation to a file.")); + +static cl::opt EmitVLoadStore( + "genx-emit-vldst", cl::init(true), cl::Hidden, + cl::desc("Emit load/store intrinsic calls for pass-by-ref arguments")); + +// There's another copy of DL string in clang/lib/Basic/Targets.cpp +static std::string getDL(bool Is64Bit) { + return Is64Bit ? "e-p:64:64-i64:64-n8:16:32" : "e-p:32:32-i64:64-n8:16:32"; +} + +namespace llvm { +//===----------------------------------------------------------------------===// +// This function is required to add GenX passes to opt tool +//===----------------------------------------------------------------------===// +void initializeGenXPasses(PassRegistry ®istry) { + initializeFunctionGroupAnalysisPass(registry); + initializeGenXAddressCommoningPass(registry); + initializeGenXArgIndirectionPass(registry); + initializeGenXCategoryPass(registry); + initializeGenXCFSimplificationPass(registry); + initializeGenXCisaBuilderPass(registry); + initializeGenXCoalescingPass(registry); + initializeGenXDeadVectorRemovalPass(registry); + initializeGenXDepressurizerPass(registry); + initializeGenXEarlySimdCFConformancePass(registry); + initializeGenXEmulatePass(registry); + initializeGenXExtractVectorizerPass(registry); + initializeGenXFuncBalingPass(registry); + initializeGenXGEPLoweringPass(registry); + initializeGenXGroupBalingPass(registry); + initializeGenXIMadPostLegalizationPass(registry); + initializeGenXLateSimdCFConformancePass(registry); + initializeGenXLayoutBlocksPass(registry); + initializeGenXLegalizationPass(registry); + initializeGenXLiveRangesPass(registry); + initializeGenXLivenessPass(registry); + initializeGenXLivenessPass(registry); + initializeGenXLowerAggrCopiesPass(registry); + initializeGenXLoweringPass(registry); + initializeGenXModulePass(registry); + initializeGenXNumberingPass(registry); + initializeGenXPatternMatchPass(registry); + initializeGenXPostLegalizationPass(registry); + initializeGenXPromotePredicatePass(registry); + initializeGenXRawSendRipperPass(registry); + initializeGenXReduceIntSizePass(registry); + initializeGenXRegionCollapsingPass(registry); + initializeGenXRematerializationPass(registry); + initializeGenXSubtargetPassPass(registry); + initializeGenXThreadPrivateMemoryPass(registry); + initializeGenXUnbalingPass(registry); + initializeGenXVisaRegAllocPass(registry); + initializeTransformPrivMemPass(registry); + 
initializeGenXFunctionPointersLoweringPass(registry); + + // WRITE HERE MORE PASSES IF IT'S NEEDED; +} + +TargetTransformInfo GenXTargetMachine::getTargetTransformInfo(const Function &F) { + GenXTTIImpl GTTI(F.getParent()->getDataLayout()); + return TargetTransformInfo(GTTI); +} + +} // namespace llvm + +GenXTargetMachine::GenXTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool Is64Bit) + : IGCLLVM::TargetMachine(T, getDL(Is64Bit), TT, CPU, FS, Options), + Is64Bit(Is64Bit), Subtarget(TT, CPU, FS) {} + +GenXTargetMachine::~GenXTargetMachine() = default; + +void GenXTargetMachine32::anchor() {} + +GenXTargetMachine32::GenXTargetMachine32(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool JIT) + : GenXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +void GenXTargetMachine64::anchor() {} + +GenXTargetMachine64::GenXTargetMachine64(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool JIT) + : GenXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// +extern "C" void LLVMInitializeGenXTarget() { + // Register the target. + RegisterTargetMachine X(getTheGenXTarget32()); + RegisterTargetMachine Y(getTheGenXTarget64()); +} + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool GenXTargetMachine::addPassesToEmitFile(PassManagerBase &PM, + raw_pwrite_stream &o, + raw_pwrite_stream * pi, + CodeGenFileType FileType, + bool DisableVerify, + MachineModuleInfo *) { + // We can consider the .isa file to be an object file, or an assembly file + // which may later be converted to GenX code by the Finalizer. If we're + // asked to produce any other type of file return true to indicate an error. + if ((FileType != IGCLLVM::TargetMachine::CodeGenFileType::CGFT_ObjectFile) && + (FileType != IGCLLVM::TargetMachine::CodeGenFileType::CGFT_AssemblyFile)) + return true; + + // GenXSubtargetPass is a wrapper pass to query features or options. + // This adds it explicitly to allow passes access the subtarget object using + // method getAnalysisIfAvailable. + PM.add(createGenXSubtargetPass(Subtarget)); + + // Wrapper structure for collecting information related to OCL runtime. + // Can be used by external caller by adding extractor pass in the end + // of compilation pipeline. 
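+  // (With GenXSubtargetPass registered above, a later pass in this pipeline
+  // can query the subtarget roughly like this -- a sketch, with the variable
+  // names being illustrative only:
+  //   if (auto *SP = getAnalysisIfAvailable<GenXSubtargetPass>()) {
+  //     const GenXSubtarget *ST = SP->getSubtarget();
+  //     // ... use ST->hasLongLong(), ST->getGRFWidth(), etc.
+  //   }
+  // The OCL runtime info added below is likewise read back by an extractor
+  // pass that the caller appends after this pipeline.)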
+ if (Subtarget.isOCLRuntime()) + PM.add(new GenXOCLRuntimeInfo()); + + // Install GenX-specific TargetTransformInfo for passes such as + // LowerAggrCopies and InfoAddressSpace + PM.add(createTargetTransformInfoWrapperPass(getTargetIRAnalysis())); + + PM.add(createSROAPass()); + PM.add(createEarlyCSEPass()); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + + PM.add(createGlobalDCEPass()); + PM.add(createGenXLowerAggrCopiesPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createTransformPrivMemPass()); + PM.add(createPromoteMemoryToRegisterPass()); + // All passes which modify the LLVM IR are now complete; run the verifier + // to ensure that the IR is valid. + if (!DisableVerify) + PM.add(createVerifierPass()); + // Run passes to generate vISA. + + /// .. include:: GenXGEPLowering.cpp + PM.add(createGenXGEPLoweringPass()); + PM.add(createGenXThreadPrivateMemoryPass()); + + /// BasicAliasAnalysis + /// ------------------ + /// This is a standard LLVM analysis pass to provide basic AliasAnalysis + /// support. + PM.add(createBasicAAWrapperPass()); + /// SROA + /// ---- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// Normally all alloca variables have been + /// removed by now by earlier LLVM passes, unless ``-O0`` was specified. + /// We run this pass here to cover that case. + /// + /// **IR restriction**: alloca, load, store not supported after this pass. + /// + PM.add(createSROAPass()); + + /// .. include:: GenXSwitchFixup.cpp + PM.add(createGenXInstCombineCleanup()); + + /// LowerSwitch + /// ----------- + /// This is a standard LLVM pass to lower a switch instruction to a chain of + /// conditional branches. + /// + /// **IR restriction**: switch not supported after this pass. + /// + // TODO: keep some switch instructions and lower them to JMPSWITCH vISA ops. + PM.add(createLowerSwitchPass()); + /// .. include:: GenXCFSimplification.cpp + PM.add(createGenXCFSimplificationPass()); + /// CFGSimplification + /// ----------------- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// + PM.add(createCFGSimplificationPass()); + /// .. include:: GenXInlineAsmLowering.cpp + PM.add(createGenXInlineAsmLoweringPass()); + /// .. include:: GenXReduceIntSize.cpp + PM.add(createGenXReduceIntSizePass()); + /// .. include:: GenXAggregatePseudoLowering.cpp + PM.add(createGenXAggregatePseudoLoweringPass()); + /// InstructionCombining + /// -------------------- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// + PM.add(createInstructionCombiningPass()); + // Run integer reduction again to revert some trunc/ext patterns transformed + // by instcombine. + PM.add(createGenXReduceIntSizePass()); + /// .. include:: GenXSimdCFConformance.cpp + PM.add(createGenXEarlySimdCFConformancePass()); + /// .. include:: GenXPromotePredicate.cpp + PM.add(createGenXPromotePredicatePass()); + // Run GEP lowering again to remove possible GEPs after instcombine. + PM.add(createGenXGEPLoweringPass()); + /// .. include:: GenXLowering.cpp + PM.add(createGenXLoweringPass()); + if (!DisableVerify) PM.add(createVerifierPass()); + PM.add(createGenXFunctionPointersLoweringPass()); + /// .. include:: GenXRegionCollapsing.cpp + PM.add(createGenXRegionCollapsingPass()); + /// EarlyCSE + /// -------- + /// This is a standard LLVM pass, run at this point in the GenX backend. 
+ /// It commons up common subexpressions, but only in the case that two common + /// subexpressions are related by one dominating the other. + /// + PM.add(createEarlyCSEPass()); + /// BreakCriticalEdges + /// ------------------ + /// In the control flow graph, a critical edge is one from a basic block with + /// multiple successors (a conditional branch) to a basic block with multiple + /// predecessors. + /// + /// We use this standard LLVM pass to split such edges, to ensure that + /// constant loader and GenXCoalescing have somewhere to insert a phi copy if + /// needed. + /// + PM.add(createBreakCriticalEdgesPass()); + /// .. include:: GenXPatternMatch.cpp + PM.add(createGenXPatternMatchPass(&Options)); + if (!DisableVerify) PM.add(createVerifierPass()); + /// .. include:: GenXExtractVectorizer.cpp + PM.add(createGenXExtractVectorizerPass()); + /// .. include:: GenXRawSendRipper.cpp + PM.add(createGenXRawSendRipperPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + /// .. include:: GenXBaling.h + PM.add(createGenXFuncBalingPass(BalingKind::BK_Legalization, &Subtarget)); + /// .. include:: GenXLegalization.cpp + PM.add(createGenXLegalizationPass()); + /// .. include:: GenXEmulate.cpp + PM.add(createGenXEmulatePass()); + /// .. include:: GenXDeadVectorRemoval.cpp + PM.add(createGenXDeadVectorRemovalPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + /// .. include:: GenXPostLegalization.cpp + /// .. include:: GenXConstants.cpp + /// .. include:: GenXVectorDecomposer.h + PM.add(createGenXPostLegalizationPass()); + if (!DisableVerify) PM.add(createVerifierPass()); + /// EarlyCSE + /// -------- + /// This is a standard LLVM pass, run at this point in the GenX backend. + /// It commons up common subexpressions, but only in the case that two common + /// subexpressions are related by one dominating the other. + /// + PM.add(createEarlyCSEPass()); + /// LICM + /// ---- + /// This is a standard LLVM pass to hoist/sink the loop invariant code after + /// legalization. + PM.add(createLICMPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + PM.add(createGenXIMadPostLegalizationPass()); + /// GlobalDCE + /// --------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// eliminates unreachable internal globals. + /// + PM.add(createGlobalDCEPass()); + /// .. include:: FunctionGroup.h + /// .. include:: GenXModule.h + PM.add(createGenXModulePass()); + /// .. include:: GenXLiveness.h + PM.add(createGenXLivenessPass()); + PM.add(createGenXGroupBalingPass(BalingKind::BK_Analysis, &Subtarget)); + PM.add(createGenXNumberingPass()); + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXRematerialization.cpp + PM.add(createGenXRematerializationPass()); + /// .. 
include:: GenXCategory.cpp + PM.add(createGenXCategoryPass()); + /// Late SIMD CF conformance pass + /// ----------------------------- + /// This is the same pass as GenXSimdCFConformance above, but run in a + /// slightly different way. See above. + /// + /// **IR restriction**: After this pass, the EM values must have EM register + /// category. The RM values must have RM register category. The !any result of + /// a goto/join must have NONE register category. + /// + PM.add(createGenXLateSimdCFConformancePass()); + /// CodeGen baling pass + /// ------------------- + /// This is the same pass as GenXBaling above, but run in a slightly different + /// way. See above. + /// + /// **IR restriction**: Any pass after this needs to be careful when modifying + /// code, as it also needs to update baling info. + /// + PM.add(createGenXGroupBalingPass(BalingKind::BK_CodeGen, &Subtarget)); + + /// .. include:: GenXNumbering.h + PM.add(createGenXNumberingPass()); + /// .. include:: GenXLiveRanges.cpp + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXUnbaling.cpp + PM.add(createGenXUnbalingPass()); + /// .. include:: GenXDepressurizer.cpp + PM.add(createGenXDepressurizerPass()); + /// .. include:: GenXNumbering.h + PM.add(createGenXNumberingPass()); + /// .. include:: GenXLiveRanges.cpp + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXCoalescing.cpp + PM.add(createGenXCoalescingPass()); + /// .. include:: GenXAddressCommoning.cpp + PM.add(createGenXAddressCommoningPass()); + /// .. include:: GenXArgIndirection.cpp + PM.add(createGenXArgIndirectionPass()); + /// .. include:: GenXTidyControlFlow.cpp + //initializeLoopInfoPass(*PassRegistry::getPassRegistry()); + PM.add(createGenXTidyControlFlowPass()); + /// .. include:: GenXVisaRegAlloc.h + auto RegAlloc = createGenXVisaRegAllocPass(); + PM.add(RegAlloc); + if (DumpRegAlloc || Subtarget.dumpRegAlloc()) + PM.add(createGenXGroupAnalysisDumperPass(RegAlloc, ".regalloc")); + + /// .. include:: GenXCisaBuilder.cpp + PM.add(createGenXCisaBuilderPass()); + PM.add(createGenXFinalizerPass(o)); + + return false; +} + +void GenXTargetMachine::adjustPassManager(PassManagerBuilder &PMBuilder) { + // Lower aggr copies. + PMBuilder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [](const PassManagerBuilder &Builder, PassManagerBase &PM) { + PM.add(createGenXLowerAggrCopiesPass()); + }); + + // Packetize. + auto AddPacketize = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createGenXPacketizePass()); + PM.add(createAlwaysInlinerLegacyPass()); + PM.add(createGlobalDCEPass()); + PM.add(createPromoteMemoryToRegisterPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createEarlyCSEPass(true)); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createDeadCodeEliminationPass()); + PM.add(createSROAPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createEarlyCSEPass(true)); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createDeadCodeEliminationPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddPacketize); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddPacketize); + + // vldst. 
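+  // ("vldst" refers to the CM vector load/store intrinsics used for
+  // pass-by-reference arguments; when EmitVLoadStore is set, the extension
+  // below first runs a small simplification/unroll pipeline and then
+  // CMLowerVLoadVStore to lower those intrinsics.)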
+ if (EmitVLoadStore) { + auto AddLowerLoadStore = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + if (Builder.OptLevel > 0) { + // Inline + PM.add(createSROAPass()); + PM.add(createEarlyCSEPass()); + PM.add(createJumpThreadingPass()); + PM.add(createCFGSimplificationPass()); + PM.add(createCorrelatedValuePropagationPass()); + PM.add(createGenXReduceIntSizePass()); + PM.add(createInstructionCombiningPass()); + PM.add(createAlwaysInlinerLegacyPass()); + PM.add(createGlobalDCEPass()); + PM.add(createInstructionCombiningPass()); + // Unroll + PM.add(createCFGSimplificationPass()); + PM.add(createReassociatePass()); + PM.add(createLoopRotatePass()); + PM.add(createLICMPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createIndVarSimplifyPass()); + PM.add(createLoopIdiomPass()); + PM.add(createLoopDeletionPass()); + PM.add(createSimpleLoopUnrollPass()); + PM.add(createInstructionCombiningPass()); + // Simplify region accesses. + PM.add(createGenXRegionCollapsingPass()); + PM.add(createEarlyCSEPass()); + PM.add(createDeadCodeEliminationPass()); + } + PM.add(createCMLowerVLoadVStorePass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddLowerLoadStore); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddLowerLoadStore); + } + + // CM implicit parameters. + auto AddCMImpParam = [this](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createCMImpParamPass(!Subtarget.isOCLRuntime())); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddCMImpParam); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddCMImpParam); + + // CM ABI. + auto AddCMABI = [](const PassManagerBuilder &Builder, PassManagerBase &PM) { + PM.add(createIPSCCPPass()); + PM.add(createCMABIPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, AddCMABI); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, AddCMABI); + + // CM kernel argument offset. 
+ auto AddCMKernelArgOffset = [this](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + unsigned Width = 32; + PM.add(createCMKernelArgOffsetPass(Width, Subtarget.isOCLRuntime())); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddCMKernelArgOffset); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddCMKernelArgOffset); + + auto AddGenXPeephole = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createGenXSimplifyPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_Peephole, AddGenXPeephole); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h new file mode 100644 index 000000000000..da752c93f61e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h @@ -0,0 +1,183 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares the TargetMachine that is used by the GenX backend. +// +// Unlike a normal CPU backend, the GenX backend does not use CodeGen (the +// LLVM target independent code generator). 
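+// Instead, addPassesToEmitFile() runs an IR-level pass pipeline that ends in
+// GenXCisaBuilder, emitting vISA which the finalizer then turns into Gen ISA,
+// so there are no MachineFunction-level passes here.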
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GENXTARGETMACHINE_H
+#define GENXTARGETMACHINE_H
+
+#include "llvmWrapper/Target/TargetMachine.h"
+
+#include "GenXIntrinsics.h"
+#include "GenXSubtarget.h"
+#include "TargetInfo/GenXTargetInfo.h"
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+class raw_pwrite_stream;
+class MachineModuleInfo;
+
+class GenXTargetMachine : public IGCLLVM::TargetMachine {
+  bool Is64Bit;
+  GenXSubtarget Subtarget;
+
+public:
+  GenXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                    CodeGenOpt::Level OL, bool Is64Bit);
+
+  ~GenXTargetMachine() override;
+
+  bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &o,
+                           raw_pwrite_stream *pi, CodeGenFileType FileType,
+                           bool /*DisableVerify*/ = true,
+                           MachineModuleInfo *MMI = nullptr) override;
+
+  void adjustPassManager(PassManagerBuilder &PMBuilder) override;
+
+  virtual const DataLayout *getDataLayout() const { return &DL; }
+
+  virtual const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+};
+
+class GenXTargetMachine32 : public GenXTargetMachine {
+  virtual void anchor();
+
+public:
+  GenXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                      CodeGenOpt::Level OL, bool JIT);
+};
+
+class GenXTargetMachine64 : public GenXTargetMachine {
+  virtual void anchor();
+
+public:
+  GenXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                      CodeGenOpt::Level OL, bool JIT);
+};
+
+// This implementation allows us to define our own costs for the GenX backend.
+// BasicTTIImplBase is not used because its overloaded constructors take a
+// TargetMachine argument, so we inherit from its parent, which only needs the
+// DataLayout.
+class GenXTTIImpl : public TargetTransformInfoImplCRTPBase<GenXTTIImpl>
+{
+  typedef TargetTransformInfoImplCRTPBase<GenXTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+public:
+  GenXTTIImpl(const DataLayout& DL) : BaseT(DL) {}
+
+  bool shouldBuildLookupTables() { return false; }
+  unsigned getFlatAddressSpace() { return 4; }
+
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands) {
+    if (auto EV = dyn_cast<ExtractValueInst>(U)) {
+      switch (GenXIntrinsic::getGenXIntrinsicID(EV->getOperand(0))) {
+      case GenXIntrinsic::genx_simdcf_goto:
+      case GenXIntrinsic::genx_simdcf_join:
+        // Do not allow such EVs to be TCC_Free.
+        return TTI::TCC_Basic;
+      default:
+        break;
+      }
+    }
+
+    return BaseT::getUserCost(U, Operands);
+  }
+
+  bool isProfitableToHoist(Instruction *I) const {
+    // genx_vload and genx_vstore are related to g_store bales and they
+    // shouldn't be hoisted from then/else blocks in front of the branch.
+    auto IntrinsicID = GenXIntrinsic::getGenXIntrinsicID(I);
+    return IntrinsicID != GenXIntrinsic::genx_vload &&
+           IntrinsicID != GenXIntrinsic::genx_vstore;
+  }
+};
+
+/// Initialize all GenX passes for opt tool.
+void initializeGenXPasses(PassRegistry &); + +void initializeFunctionGroupAnalysisPass(PassRegistry &); +void initializeGenXAddressCommoningPass(PassRegistry &); +void initializeGenXArgIndirectionPass(PassRegistry &); +void initializeGenXCategoryPass(PassRegistry &); +void initializeGenXCFSimplificationPass(PassRegistry &); +void initializeGenXCisaBuilderPass(PassRegistry &); +void initializeGenXCoalescingPass(PassRegistry &); +void initializeGenXDeadVectorRemovalPass(PassRegistry &); +void initializeGenXDepressurizerPass(PassRegistry &); +void initializeGenXEarlySimdCFConformancePass(PassRegistry &); +void initializeGenXEmulatePass(PassRegistry &); +void initializeGenXExtractVectorizerPass(PassRegistry &); +void initializeGenXFuncBalingPass(PassRegistry &); +void initializeGenXGEPLoweringPass(PassRegistry &); +void initializeGenXGroupBalingPass(PassRegistry &); +void initializeGenXInstCombineCleanup(PassRegistry &); +void initializeGenXIMadPostLegalizationPass(PassRegistry &); +void initializeGenXLateSimdCFConformancePass(PassRegistry &); +void initializeGenXLayoutBlocksPass(PassRegistry &); +void initializeGenXLegalizationPass(PassRegistry &); +void initializeGenXLiveRangesPass(PassRegistry &); +void initializeGenXLivenessPass(PassRegistry &); +void initializeGenXLowerAggrCopiesPass(PassRegistry &); +void initializeGenXLoweringPass(PassRegistry &); +void initializeGenXModulePass(PassRegistry &); +void initializeGenXNumberingPass(PassRegistry &); +void initializeGenXPatternMatchPass(PassRegistry &); +void initializeGenXPostLegalizationPass(PassRegistry &); +void initializeGenXPostLegalizationPass(PassRegistry &); +void initializeGenXPromotePredicatePass(PassRegistry &); +void initializeGenXRawSendRipperPass(PassRegistry &); +void initializeGenXReduceIntSizePass(PassRegistry &); +void initializeGenXRegionCollapsingPass(PassRegistry &); +void initializeGenXRematerializationPass(PassRegistry &); +void initializeGenXSubtargetPassPass(PassRegistry &); +void initializeGenXThreadPrivateMemoryPass(PassRegistry &); +void initializeGenXUnbalingPass(PassRegistry &); +void initializeGenXVisaRegAllocPass(PassRegistry &); +void initializeTransformPrivMemPass(PassRegistry &); +void initializeGenXFunctionPointersLoweringPass(PassRegistry &); +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp new file mode 100644 index 000000000000..4483645f0fdc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp @@ -0,0 +1,1023 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// This pass lowers alloca instructions to genx.alloca intrinsics and changes +/// pointer from alloca to offset in predefined stack surface +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// This actually should've been a FunctionGroupPass, +// but due to the FGPassManager hack we can't run GenXModule twice +// so for now we can't insert module pass that invalidate FGA betw FGPasses +class GenXThreadPrivateMemory : public ModulePass, + public InstVisitor { +public: + GenXThreadPrivateMemory(); + + virtual StringRef getPassName() const override { + return "GenXThreadPrivateMemory"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + AU.setPreservesCFG(); + } + + bool runOnModule(Module &M) override; + bool runOnFunction(Function &F); + + void visitAllocaInst(AllocaInst &I); + +private: + bool replacePhi(PHINode *Phi); + bool preparePhiForReplacement(PHINode *Phi); + bool replaceScatterPrivate(CallInst *CI); + bool replaceGatherPrivate(CallInst *CI); + bool replacePTI(PtrToIntInst *PTI); + bool replaceStore(StoreInst *StI); + bool replaceLoad(LoadInst *LdI); + bool replaceSelect(SelectInst *Sel); + bool replaceAddrSpaceCast(AddrSpaceCastInst * AddrCast); + Value *lookForPtrReplacement(Value *Ptr) const; + void addUsers(Instruction *I); + void collectEachAllocaUsers(); + void addUsersIfNeeded(Instruction *I); + std::pair NormalizeVector(Value *From, Type *To, + Instruction *InsertBefore); + Instruction *RestoreVectorAfterNormalization(Instruction *From, Type *To); + +public: + static char ID; + +private: + LLVMContext *m_ctx; + GenXSubtarget *m_ST; + const DataLayout *m_DL; + std::vector m_alloca; + std::vector m_gather; + std::vector m_scatter; + std::map m_allocaToIntrinsic; + std::queue m_AIUsers; + std::set m_AlreadyAdded; + PreDefined_Surface m_stack; +}; +} // namespace + +// Register pass to igc-opt +namespace llvm { +void initializeGenXThreadPrivateMemoryPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXThreadPrivateMemory, "GenXThreadPrivateMemory", + "GenXThreadPrivateMemory", false, false) +INITIALIZE_PASS_END(GenXThreadPrivateMemory, "GenXThreadPrivateMemory", + "GenXThreadPrivateMemory", false, false) + +char GenXThreadPrivateMemory::ID = 0; + +ModulePass *llvm::createGenXThreadPrivateMemoryPass() { + return new GenXThreadPrivateMemory; +} + +GenXThreadPrivateMemory::GenXThreadPrivateMemory() : ModulePass(ID) { + initializeGenXThreadPrivateMemoryPass(*PassRegistry::getPassRegistry()); +} + +static Value *ZExtOrTruncIfNeeded(Value *From, Type *To, + Instruction *InsertBefore) { + unsigned FromTySz = From->getType()->getPrimitiveSizeInBits(); + 
unsigned ToTySz = To->getPrimitiveSizeInBits(); + Value *Res = From; + if (From->getType()->isVectorTy() && + From->getType()->getVectorNumElements() == 1) { + Res = CastInst::CreateBitOrPointerCast( + Res, From->getType()->getVectorElementType(), "", InsertBefore); + } + if (FromTySz < ToTySz) + Res = CastInst::CreateZExtOrBitCast(Res, To, "", InsertBefore); + else if (FromTySz > ToTySz) + Res = CastInst::CreateTruncOrBitCast(Res, To, "", InsertBefore); + return Res; +} + +// If data is a vector of double/int64, bitcast each element to 2 int32. +// If data is a vector of type < 32bit, extend each element in order to create +// proper send instruction in the finalizer. +std::pair +GenXThreadPrivateMemory::NormalizeVector(Value *From, Type *To, + Instruction *InsertBefore) { + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *Res = From; + Type *FromTy = From->getType(); + assert(isa(FromTy)); + unsigned NumElts = FromTy->getVectorNumElements(); + unsigned EltSz = + m_DL->getTypeSizeInBits(FromTy->getScalarType()) / genx::ByteBits; + assert(EltSz > 0); + if (To->getScalarType()->isPointerTy() && + To->getScalarType()->getPointerElementType()->isFunctionTy()) { + Type *I64Ty = Type::getInt64Ty(InsertBefore->getContext()); + To = VectorType::get(I64Ty, NumElts); + Res = CastInst::Create(Instruction::PtrToInt, From, To, "", InsertBefore); + NumElts *= 2; + To = VectorType::get(I32Ty, NumElts); + EltSz = I32Ty->getPrimitiveSizeInBits() / genx::ByteBits; + Res = CastInst::Create(Instruction::BitCast, Res, To, "", InsertBefore); + } else if (To->getVectorElementType()->getPrimitiveSizeInBits() < + genx::DWordBits) { + To = VectorType::get(I32Ty, NumElts); + + Res = CastInst::Create(Instruction::ZExt, From, To, "", InsertBefore); + } else if (To->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + NumElts *= 2; + EltSz = I32Ty->getPrimitiveSizeInBits() / genx::ByteBits; + To = VectorType::get(I32Ty, NumElts); + + Res = CastInst::Create(Instruction::BitCast, From, To, "", InsertBefore); + } + + return std::make_pair(Res, EltSz); +} + +Instruction * +GenXThreadPrivateMemory::RestoreVectorAfterNormalization(Instruction *From, + Type *To) { + Instruction *Restored = From; + unsigned EltSz = m_DL->getTypeSizeInBits(To->getScalarType()); + assert(EltSz > 0); + if (To->getScalarType()->isPointerTy() && + To->getScalarType()->getPointerElementType()->isFunctionTy()) { + Restored = PtrToIntInst::Create(Instruction::IntToPtr, From, To); + } else if (EltSz < genx::DWordBits) { + Restored = CastInst::Create(Instruction::Trunc, From, To, ""); + } else if (EltSz == genx::QWordBits) { + auto *NewFrom = From; + if (!From->getType()->getScalarType()->isPointerTy() && + To->getScalarType()->isPointerTy()) { + assert(From->getType()->getScalarType()->isIntegerTy(genx::DWordBits)); + Type *NewTy = + VectorType::get(Type::getInt64Ty(*m_ctx), + From->getType()->getVectorNumElements() / 2); + NewFrom = CastInst::CreateBitOrPointerCast(From, NewTy); + NewFrom->insertAfter(From); + Restored = CastInst::Create(CastInst::IntToPtr, NewFrom, To); + } else + Restored = CastInst::CreateBitOrPointerCast(NewFrom, To); + } + if (Restored != From) + Restored->insertAfter(From); + return Restored; +} + +static Value *DoubleVector(Value *OrigVector, unsigned ShiftVal, + Instruction *InsertPoint) { + IRBuilder<> Builder(InsertPoint); + Type *I32Ty = Type::getInt32Ty(InsertPoint->getContext()); + unsigned NumElts = OrigVector->getType()->getVectorNumElements() * 2; + Type *OrigVectorEltTy = 
OrigVector->getType()->getVectorElementType(); + Value *NewElts = UndefValue::get(VectorType::get(OrigVectorEltTy, NumElts)); + for (unsigned CurEltNum = 0; CurEltNum * 2 < NumElts; ++CurEltNum) { + Value *OldIdx = ConstantInt::get(I32Ty, CurEltNum); + Value *NewIdx = ConstantInt::get(I32Ty, CurEltNum * 2); + Value *EltOld = Builder.CreateExtractElement(OrigVector, OldIdx); + NewElts = Builder.CreateInsertElement(NewElts, EltOld, NewIdx); + NewIdx = ConstantInt::get(I32Ty, CurEltNum * 2 + 1); + if (ShiftVal) { + Value *TyShift = ConstantInt::get(I32Ty, ShiftVal); + EltOld = Builder.CreateAdd(EltOld, TyShift); + } + NewElts = Builder.CreateInsertElement(NewElts, EltOld, NewIdx); + } + + return NewElts; +} + +static Value *FormEltsOffsetVector(unsigned NumElts, unsigned TySz, + Instruction *InsertBefore) { + IRBuilder<> Builder(InsertBefore); + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *EltsOffset = UndefValue::get(VectorType::get(I32Ty, NumElts)); + for (unsigned CurElt = 0; CurElt < NumElts; ++CurElt) { + Value *Idx = ConstantInt::get(I32Ty, CurElt); + Value *EltOffset = ConstantInt::get(I32Ty, CurElt * TySz); + EltsOffset = Builder.CreateInsertElement(EltsOffset, EltOffset, Idx); + } + + return EltsOffset; +} + +static Value *FormEltsOffsetVectorForSVM(unsigned NumElts, + Instruction *InsertBefore, Value *Offset) { + IRBuilder<> Builder(InsertBefore); + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Type *I64Ty = Type::getInt64Ty(InsertBefore->getContext()); + Value *EltsOffset = UndefValue::get(VectorType::get(I64Ty, NumElts)); + if (Offset->getType()->isVectorTy()) { + assert(Offset->getType()->getVectorNumElements() == 1); + Offset = CastInst::CreateZExtOrBitCast(Offset, I64Ty, "", InsertBefore); + } + for (unsigned CurElt = 0; CurElt < NumElts; ++CurElt) { + Value *Idx = ConstantInt::get(I32Ty, CurElt); + EltsOffset = Builder.CreateInsertElement(EltsOffset, Offset, Idx); + } + + return EltsOffset; +} + + +Value *GenXThreadPrivateMemory::lookForPtrReplacement(Value *Ptr) const { + assert(Ptr->getType()->isPtrOrPtrVectorTy()); + + if (auto BC = dyn_cast(Ptr)) + return lookForPtrReplacement(BC->getOperand(0)); + else if (auto ITP = dyn_cast(Ptr)) + return ITP->getOperand(0); + else if (auto AI = dyn_cast(Ptr)) { + auto AllocaIntr = m_allocaToIntrinsic.find(AI); + assert(AllocaIntr != m_allocaToIntrinsic.end() && + "Each alloca must be here"); + return AllocaIntr->second; + } else if (auto *EEI = dyn_cast(Ptr)) { + // support a case when load/gather addr goes from svm.ld + extract_elem + auto *CI = dyn_cast(EEI->getVectorOperand()); + if (CI && !CI->isIndirectCall() && + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_svm_block_ld) { + if (Ptr->getType()->isPointerTy()) { + auto *Cast = CastInst::Create(Instruction::PtrToInt, Ptr, + Type::getInt32Ty(*m_ctx)); + Cast->insertAfter(EEI); + return Cast; + } else + return Ptr; + } else + report_fatal_error("Cannot find pointer replacement for extractelem"); + } else if (isa(Ptr)) + return ConstantInt::get(Type::getInt32Ty(*m_ctx), 0); + else + report_fatal_error("Cannot find pointer replacement"); +} + +bool GenXThreadPrivateMemory::replaceAddrSpaceCast( + AddrSpaceCastInst* AddrCast) { + auto NewAlloca = lookForPtrReplacement(AddrCast->getPointerOperand()); + + auto IntToPtr = IntToPtrInst::Create( + llvm::Instruction::CastOps::IntToPtr, NewAlloca, + AddrCast->getPointerOperand()->getType(), "", AddrCast); + auto NewAddrCast = + 
AddrSpaceCastInst::Create(llvm::Instruction::CastOps::AddrSpaceCast, + IntToPtr, AddrCast->getType(), "", AddrCast); + + AddrCast->replaceAllUsesWith(NewAddrCast); + AddrCast->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceLoad(LoadInst *LdI) { + IRBuilder<> Builder(LdI); + Type *LdTy = LdI->getType(); + Type *LdEltTy = LdTy; + if (isa(LdEltTy)) + LdEltTy = LdEltTy->getVectorElementType(); + else + LdTy = VectorType::get(LdTy, 1); + + unsigned NumEltsToLoad = LdTy->getVectorNumElements(); + unsigned LdEltTySz = m_DL->getTypeSizeInBits(LdEltTy); + if (LdEltTySz == genx::QWordBits) + NumEltsToLoad *= 2; + + Value *PredVal = ConstantInt::get(Type::getInt1Ty(*m_ctx), 1); + Value *Pred = Builder.CreateVectorSplat(NumEltsToLoad, PredVal); + + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Type *I64Ty = Type::getInt64Ty(*m_ctx); + Type *TyToLoad = I32Ty; + if (LdEltTy->isFloatTy()) + TyToLoad = LdEltTy; + Type *RealTyToLoad = LdEltTy; + if (m_DL->getTypeSizeInBits(RealTyToLoad) == genx::QWordBits) + RealTyToLoad = I32Ty; + unsigned RealTyToLoadSz = + m_DL->getTypeSizeInBits(RealTyToLoad) / genx::ByteBits; + Value *OldValOfTheDataRead = + Builder.CreateVectorSplat(NumEltsToLoad, UndefValue::get(TyToLoad)); + + + Value *PointerOp = LdI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + Offset = + ZExtOrTruncIfNeeded(Offset, m_ST->useGlobalMem() ? I64Ty : I32Ty, LdI); + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_gather + : llvm::GenXIntrinsic::genx_gather_scaled; + + Value *EltsOffset = FormEltsOffsetVector(NumEltsToLoad, RealTyToLoadSz, LdI); + + unsigned SrcSize = genx::log2(RealTyToLoadSz); + Value *logNumBlocks = ConstantInt::get(I32Ty, m_ST->useGlobalMem() ? 0 : SrcSize); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + if (m_ST->useGlobalMem() && NumEltsToLoad > 1) { + assert(Offset->getType()->getScalarType()->isIntegerTy(64)); + auto *BaseOff = FormEltsOffsetVectorForSVM(NumEltsToLoad, LdI, Offset); + auto *ZextOff = CastInst::CreateZExtOrBitCast( + EltsOffset, + VectorType::get(I64Ty, EltsOffset->getType()->getVectorNumElements()), + "", LdI); + Offset = BinaryOperator::CreateAdd(BaseOff, ZextOff, "", LdI); + } + Function *F = GenXIntrinsic::getGenXDeclaration( + LdI->getModule(), IID, + {OldValOfTheDataRead->getType(), + Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType()}); + CallInst *Gather = + m_ST->useGlobalMem() + ? 
IntrinsicInst::Create( + F, {Pred, logNumBlocks, Offset, OldValOfTheDataRead}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, OldValOfTheDataRead}); + Gather->insertAfter(LdI); + m_gather.push_back(Gather); + Instruction *ProperGather = RestoreVectorAfterNormalization(Gather, LdTy); + + if (!isa(LdI->getType()) && + isa(ProperGather->getType())) { + Instruction *LdVal = CastInst::CreateBitOrPointerCast(ProperGather, LdI->getType()); + LdVal->insertAfter(ProperGather); + ProperGather = LdVal; + } + + LdI->replaceAllUsesWith(ProperGather); + LdI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + Value *ValueOp = StI->getValueOperand(); + Type *ValueOpTy = ValueOp->getType(); + if (ValueOpTy->isIntOrPtrTy() || ValueOpTy->isFloatingPointTy()) { + ValueOp = Builder.CreateVectorSplat(1, ValueOp); + ValueOpTy = ValueOp->getType(); + } + assert(ValueOp->getType()->isVectorTy()); + + unsigned ValueEltSz = 0; + std::tie(ValueOp, ValueEltSz) = NormalizeVector(ValueOp, ValueOpTy, StI); + unsigned ValueNumElts = ValueOp->getType()->getVectorNumElements(); + + Value *PointerOp = StI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Type *I64Ty = Type::getInt64Ty(*m_ctx); + Offset = + ZExtOrTruncIfNeeded(Offset, m_ST->useGlobalMem() ? I64Ty : I32Ty, StI); + + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_scatter + : llvm::GenXIntrinsic::genx_scatter_scaled; + + Value *PredVal = ConstantInt::get(Type::getInt1Ty(*m_ctx), 1); + Value *Pred = Builder.CreateVectorSplat(ValueNumElts, PredVal); + Value *EltsOffset = FormEltsOffsetVector(ValueNumElts, ValueEltSz, StI); + + if (m_ST->useGlobalMem() && ValueNumElts > 1) { + assert(Offset->getType()->getScalarType()->isIntegerTy(64)); + auto *BaseOff = FormEltsOffsetVectorForSVM(ValueNumElts, StI, Offset); + auto *ZextOff = CastInst::CreateZExtOrBitCast( + EltsOffset, + VectorType::get(I64Ty, EltsOffset->getType()->getVectorNumElements()), + "", StI); + Offset = BinaryOperator::CreateAdd(BaseOff, ZextOff, "", StI); + } + + Function *F = GenXIntrinsic::getGenXDeclaration( + StI->getModule(), IID, + {Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType(), + ValueOp->getType()}); + Value *logNumBlocks = ConstantInt::get(I32Ty, m_ST->useGlobalMem() ? 0 : genx::log2(ValueEltSz)); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + auto *Scatter = + m_ST->useGlobalMem() + ? IntrinsicInst::Create(F, {Pred, logNumBlocks, Offset, ValueOp}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, ValueOp}); + Scatter->insertAfter(StI); + StI->eraseFromParent(); + + m_scatter.push_back(Scatter); + + return true; +} + +bool GenXThreadPrivateMemory::replacePTI(PtrToIntInst *PTI) { + Value *PointerOp = PTI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + + Offset = ZExtOrTruncIfNeeded(Offset, PTI->getDestTy(), PTI); + PTI->replaceAllUsesWith(Offset); + PTI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceGatherPrivate(CallInst *CI) { + auto IID = m_ST->useGlobalMem() + ? 
llvm::GenXIntrinsic::genx_svm_gather + : llvm::GenXIntrinsic::genx_gather_scaled; + + Type *OrigDstTy = CI->getType(); + assert(isa(OrigDstTy)); + Type *NewDstTy = OrigDstTy; + Value *OldValue = CI->getArgOperand(3); + unsigned ValueEltSz = 0; + + // Check gather.private invariant. + assert(NewDstTy == OldValue->getType()); + + // Cast data type to legal. + std::tie(OldValue, ValueEltSz) = NormalizeVector(OldValue, NewDstTy, CI); + NewDstTy = OldValue->getType(); + unsigned ValueNumElts = NewDstTy->getVectorNumElements(); + + Value *Pred = CI->getArgOperand(0); + Value *EltsOffset = CI->getArgOperand(2); + if (OrigDstTy->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + assert(ValueNumElts == EltsOffset->getType()->getVectorNumElements() * 2); + EltsOffset = DoubleVector(EltsOffset, ValueEltSz, CI); + Pred = DoubleVector(Pred, 0, CI); + } + + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Value *PointerOp = CI->getOperand(1); + Value *Offset = lookForPtrReplacement(PointerOp); + Offset = ZExtOrTruncIfNeeded(Offset, I32Ty, CI); + + Function *F = GenXIntrinsic::getGenXDeclaration( + CI->getModule(), IID, + {NewDstTy, Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType()}); + + Value *logNumBlocks = ConstantInt::get(I32Ty, genx::log2(ValueEltSz)); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + + CallInst *Gather = + m_ST->useGlobalMem() + ? IntrinsicInst::Create(F, {Pred, logNumBlocks, Offset, OldValue}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, OldValue}); + Gather->insertAfter(CI); + m_gather.push_back(Gather); + + Instruction *ProperGather = + RestoreVectorAfterNormalization(Gather, OrigDstTy); + CI->replaceAllUsesWith(ProperGather); + CI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceScatterPrivate(CallInst *CI) { + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_scatter + : llvm::GenXIntrinsic::genx_scatter_scaled; + Value *ValueOp = CI->getArgOperand(3); + Type *OrigValueTy = ValueOp->getType(); + assert(isa(OrigValueTy)); + unsigned EltSz = 0; + std::tie(ValueOp, EltSz) = NormalizeVector(ValueOp, ValueOp->getType(), CI); + + Value *Pred = CI->getArgOperand(0); + Value *EltsOffset = CI->getArgOperand(2); + if (OrigValueTy->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + EltsOffset = DoubleVector(EltsOffset, EltSz, CI); + Pred = DoubleVector(Pred, 0, CI); + } + + Value *ScatterPtr = CI->getArgOperand(1); + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Value *Offset = lookForPtrReplacement(ScatterPtr); + Offset = ZExtOrTruncIfNeeded(Offset, I32Ty, CI); + + Function *F = GenXIntrinsic::getGenXDeclaration( + CI->getModule(), IID, + {Pred->getType(), (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType(), + ValueOp->getType()}); + + unsigned logNumBlocks = genx::log2(EltSz); + unsigned Scale = 0; // scale is always 0 + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + CallInst *ScatterStScaled = + m_ST->useGlobalMem() + ? 
IntrinsicInst::Create( + F, + {Pred, ConstantInt::get(I32Ty, logNumBlocks), Offset, ValueOp}) + : IntrinsicInst::Create( + F, {Pred, ConstantInt::get(I32Ty, logNumBlocks), + ConstantInt::get(Type::getInt16Ty(*m_ctx), Scale), Surface, + Offset, EltsOffset, ValueOp}); + ScatterStScaled->insertAfter(CI); + m_scatter.push_back(ScatterStScaled); + CI->replaceAllUsesWith(ScatterStScaled); + CI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replacePhi(PHINode *Phi) { + SmallVector PhiOps; + for (auto &IncVal : Phi->incoming_values()) + PhiOps.push_back(lookForPtrReplacement(static_cast(IncVal.get()))); + + assert(!PhiOps.empty()); + + Type *OffsetTy = PhiOps[0]->getType(); + auto TypeChecker = [OffsetTy](Value *V) { return OffsetTy == V->getType(); }; + assert(std::all_of(PhiOps.begin(), PhiOps.end(), TypeChecker)); + + PHINode *NewPhi = PHINode::Create(OffsetTy, PhiOps.size()); + for (unsigned i = 0; i < PhiOps.size(); ++i) + NewPhi->addIncoming(PhiOps[i], Phi->getIncomingBlock(i)); + + NewPhi->insertAfter(Phi); + + // Create temporary cast instruction to satisfy old phi users. Types must be + // different due to replacement pointer by integer offset. + assert(NewPhi->getType() != Phi->getType()); + CastInst *TempCast = CastInst::CreateBitOrPointerCast(NewPhi, Phi->getType()); + TempCast->insertAfter(NewPhi->getParent()->getFirstNonPHI()); + + Phi->replaceAllUsesWith(TempCast); + Phi->eraseFromParent(); + + return true; +} + +// |--%1 = PHI(%2, ...) +// | ^ +// | | +// | | +// | %2 = PHI(%1, ...) +// |---------^ +// +// In this situation, it's difficult to find the origin of the pointer. PtrToInt +// and IntToPtr break the process of searching (see lookForPtrReplacement) and +// it helps to 'emulate' phi in TPM +bool GenXThreadPrivateMemory::preparePhiForReplacement(PHINode *Phi) { + if (!isa(Phi->getType())) + return false; + + Type *I64Ty = Type::getInt64Ty(Phi->getContext()); + StringRef Name = Phi->getName(); + Instruction *TempPtrToInt = CastInst::Create( + Instruction::PtrToInt, Phi, I64Ty, Name + ".tpm.temp.pti", + Phi->getParent()->getFirstNonPHI()); + Instruction *TempIntToPtr = + CastInst::Create(Instruction::IntToPtr, TempPtrToInt, Phi->getType(), + Name + ".tpm.temp.itp"); + TempIntToPtr->insertAfter(TempPtrToInt); + Phi->replaceAllUsesWith(TempIntToPtr); + + // Replacement here was incorrect + TempPtrToInt->replaceUsesOfWith(TempIntToPtr, Phi); + + return true; +} + +bool GenXThreadPrivateMemory::replaceSelect(SelectInst *Sel) { + Value *Cond = Sel->getCondition(); + Value *TrueValue = lookForPtrReplacement(Sel->getTrueValue()); + Value *FalseValue = lookForPtrReplacement(Sel->getFalseValue()); + + SelectInst *NewSel = SelectInst::Create(Cond, TrueValue, FalseValue); + NewSel->insertAfter(Sel); + NewSel->setDebugLoc(Sel->getDebugLoc()); + + CastInst *TempCast = CastInst::CreateBitOrPointerCast(NewSel, Sel->getType()); + TempCast->insertAfter(NewSel); + TempCast->setDebugLoc(Sel->getDebugLoc()); + + Sel->replaceAllUsesWith(TempCast); + Sel->eraseFromParent(); + + return true; +} + +static Value *GetUndefVec(Type *Ty, unsigned NumElts) { + return UndefValue::get(VectorType::get(Ty, NumElts)); +} + +static std::pair GetUndefPair(Type *Ty, unsigned NumElts) { + return std::make_pair(GetUndefVec(Ty, NumElts), GetUndefVec(Ty, NumElts)); +} + +static Value *FillVecWithSeqVals(Value *Vec, unsigned Start, + Instruction *InsertBefore) { + IRBuilder<> Builder(InsertBefore); + Builder.SetInsertPoint(InsertBefore); + + Type *I32Ty = 
Type::getInt32Ty(InsertBefore->getContext()); + unsigned NumElts = Vec->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + Value *Idx = ConstantInt::get(I32Ty, i); + Value *Val = ConstantInt::get(I32Ty, i + Start); + Vec = Builder.CreateInsertElement(Vec, Val, Idx); + } + return Vec; +} + +static std::pair +SplitVec(Value *Vec, unsigned NumElts, Instruction *InsertBefore, + std::pair Splitters) { + IRBuilder<> Builder(InsertBefore); + Builder.SetInsertPoint(InsertBefore); + + Type *EltTy = Vec->getType()->getVectorElementType(); + Value *First = Builder.CreateShuffleVector(Vec, GetUndefVec(EltTy, NumElts), + Splitters.first); + Value *Second = Builder.CreateShuffleVector(Vec, GetUndefVec(EltTy, NumElts), + Splitters.second); + return std::make_pair(First, Second); +} + +void SplitScatter(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI) == + llvm::GenXIntrinsic::genx_scatter_scaled); + Type *DataTy = CI->getArgOperand(5)->getType(); + unsigned NumElts = DataTy->getVectorNumElements(); + assert(NumElts % 2 == 0); + + Type *I32Ty = Type::getInt32Ty(CI->getContext()); + std::pair Splitters = GetUndefPair(I32Ty, NumElts / 2); + Splitters.first = FillVecWithSeqVals(Splitters.first, 0, CI); + Splitters.second = FillVecWithSeqVals(Splitters.second, NumElts / 2, CI); + + Value *Pred = CI->getArgOperand(0); + std::pair NewPreds = SplitVec(Pred, NumElts, CI, Splitters); + + Value *EltOffsets = CI->getArgOperand(5); + std::pair NewEltOffsets = + SplitVec(EltOffsets, NumElts, CI, Splitters); + + Value *OldVal = CI->getArgOperand(6); + std::pair OldVals = + SplitVec(OldVal, NumElts, CI, Splitters); + + auto IID = llvm::GenXIntrinsic::genx_scatter_scaled; + Function *F = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IID, + {NewPreds.first->getType(), + NewEltOffsets.first->getType(), + OldVals.first->getType()}); + + Value *LogNumBlock = CI->getArgOperand(1); + Value *Scale = CI->getArgOperand(2); + Value *Surface = CI->getArgOperand(3); + Value *Offset = CI->getArgOperand(4); + + CallInst *FirstScatter = + IntrinsicInst::Create(F, {NewPreds.first, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.first, OldVals.first}); + CallInst *SecondScatter = + IntrinsicInst::Create(F, {NewPreds.second, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.second, OldVals.second}); + + FirstScatter->insertAfter(CI); + SecondScatter->insertAfter(FirstScatter); + + CI->eraseFromParent(); +} + +void SplitGather(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI) == + llvm::GenXIntrinsic::genx_gather_scaled); + Type *DstTy = CI->getType(); + unsigned NumElts = DstTy->getVectorNumElements(); + assert(NumElts % 2 == 0); + + Type *I32Ty = Type::getInt32Ty(CI->getContext()); + std::pair Splitters = GetUndefPair(I32Ty, NumElts / 2); + Splitters.first = FillVecWithSeqVals(Splitters.first, 0, CI); + Splitters.second = FillVecWithSeqVals(Splitters.second, NumElts / 2, CI); + + Value *Pred = CI->getArgOperand(0); + std::pair NewPreds = SplitVec(Pred, NumElts, CI, Splitters); + + Value *EltOffsets = CI->getArgOperand(5); + std::pair NewEltOffsets = + SplitVec(EltOffsets, NumElts, CI, Splitters); + + Value *OldVal = CI->getArgOperand(6); + std::pair OldVals = + SplitVec(OldVal, NumElts, CI, Splitters); + auto IID = llvm::GenXIntrinsic::genx_gather_scaled; + Function *F = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IID, + {OldVals.first->getType(), + NewPreds.first->getType(), + NewEltOffsets.first->getType()}); + + Value *LogNumBlock = CI->getArgOperand(1); + Value 
*Scale = CI->getArgOperand(2); + Value *Surface = CI->getArgOperand(3); + Value *Offset = CI->getArgOperand(4); + + CallInst *FirstGather = + IntrinsicInst::Create(F, {NewPreds.first, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.first, OldVals.first}); + CallInst *SecondGather = + IntrinsicInst::Create(F, {NewPreds.second, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.second, OldVals.second}); + + FirstGather->insertAfter(CI); + SecondGather->insertAfter(FirstGather); + + Value *Joiner = FillVecWithSeqVals(GetUndefVec(I32Ty, NumElts), 0, CI); + IRBuilder<> Builder(CI); + Builder.SetInsertPoint(SecondGather->getNextNode()); + Value *JointGather = + Builder.CreateShuffleVector(FirstGather, SecondGather, Joiner); + + CI->replaceAllUsesWith(JointGather); + CI->eraseFromParent(); +} + +void GenXThreadPrivateMemory::addUsers(Instruction *I) { + assert(I); + for (auto Usr : I->users()) { + Instruction *ToAdd = cast(Usr); + auto Found = m_AlreadyAdded.find(ToAdd); + if (Found == m_AlreadyAdded.end()) { + m_AlreadyAdded.insert(ToAdd); + m_AIUsers.push(ToAdd); + } + } +} + +void GenXThreadPrivateMemory::collectEachAllocaUsers() { + assert(m_AIUsers.empty()); + m_AlreadyAdded.clear(); + for (auto B = m_allocaToIntrinsic.begin(), E = m_allocaToIntrinsic.end(); + B != E; ++B) { + Instruction *I = dyn_cast(B->first); + assert(I); + addUsers(I); + } +} + +void GenXThreadPrivateMemory::addUsersIfNeeded(Instruction *I) { + bool isGatherScatterPrivate = false; + if (IntrinsicInst *CI = dyn_cast(I)) { + unsigned ID = GenXIntrinsic::getAnyIntrinsicID(CI); + switch (ID) { + case GenXIntrinsic::genx_gather_private: + case GenXIntrinsic::genx_scatter_private: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + isGatherScatterPrivate = true; + break; + default: + break; + } + } + if (m_ST->useGlobalMem() || + (!isa(I) && !isa(I) && !isGatherScatterPrivate)) + addUsers(I); +} + +// pre-transformation analysis to determine +// which kind of mem should we place TPM at +static bool checkSVMNecessary(Instruction *Inst, int LoadsMet = 0) { + // do not handle ConstExprs for now + if (!Inst) + return false; + if (isa(Inst)) { + if (LoadsMet > 0) + return true; + else + ++LoadsMet; + } else if (auto *CI = dyn_cast(Inst)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (IID == GenXIntrinsic::genx_gather_private || + IID == GenXIntrinsic::genx_scatter_private || + IID == GenXIntrinsic::not_any_intrinsic) { + // do not process users of priv mem intrinsics + // or calls to other functions + return false; + } + } else if (isa(Inst)) { + // do not go thru phi as loops may appear and + // it doesn't seem necessary for the analysis now + return false; + } + bool Result = false; + for (auto *U : Inst->users()) { + Result |= checkSVMNecessary(dyn_cast(U), LoadsMet); + if (Result) + break; + } + return Result; +} + +// required to pass find_if's typecheck +static bool checkSVMNecessaryPred(Instruction *Inst) { + return checkSVMNecessary(Inst); +} + +bool GenXThreadPrivateMemory::runOnModule(Module &M) { + auto STP = getAnalysisIfAvailable(); + assert(STP); + m_ST = STP->getSubtarget(); + for (auto &F : M) + visit(F); + if (std::find_if(m_alloca.begin(), m_alloca.end(), checkSVMNecessaryPred) != + m_alloca.end()) + m_ST->setUseGlobalMem(); + bool Result = false; + for (auto &F : M) + Result |= runOnFunction(F); + return Result; +} + +bool GenXThreadPrivateMemory::runOnFunction(Function &F) { + m_DL = &F.getParent()->getDataLayout(); + m_stack = m_ST->stackSurface(); + + m_ctx = 
&F.getContext(); + m_DL = &F.getParent()->getDataLayout(); + m_alloca.clear(); + m_gather.clear(); + m_scatter.clear(); + m_allocaToIntrinsic.clear(); + m_AIUsers = {}; + m_AlreadyAdded.clear(); + + visit(F); + + for (auto Alloca : m_alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + auto IID = llvm::GenXIntrinsic::genx_alloca; + Function *IntrDecl = GenXIntrinsic::getGenXDeclaration(Alloca->getModule(), IID, AllocaTy); + CallInst *AllocaIntr = + IntrinsicInst::Create(IntrDecl, {Constant::getNullValue(AllocaTy)}); + AllocaIntr->insertAfter(Alloca); + m_allocaToIntrinsic[Alloca] = AllocaIntr; + } + + // Firstly, we resolve dependencies in PHI nodes (see comments in + // preparePhiForReplacement). + collectEachAllocaUsers(); + bool Changed = false; + while (!m_AIUsers.empty()) { + Instruction *I = m_AIUsers.front(); + m_AIUsers.pop(); + + addUsersIfNeeded(I); + + if (PHINode *Phi = dyn_cast(I)) + Changed |= preparePhiForReplacement(Phi); + } + + // Main loop where instructions are replaced one by one. + collectEachAllocaUsers(); + while (!m_AIUsers.empty()) { + Instruction *I = m_AIUsers.front(); + m_AIUsers.pop(); + + addUsersIfNeeded(I); + + if (auto *LdI = dyn_cast(I)) + Changed |= replaceLoad(LdI); + else if (auto *StI = dyn_cast(I)) + Changed |= replaceStore(StI); + else if (auto *PTI = dyn_cast(I)) + Changed |= replacePTI(PTI); + else if (auto* AddrCast = dyn_cast(I)) + Changed |= replaceAddrSpaceCast(AddrCast); + else if (isa(I) || isa(I)) { + // resolve all IntToPtr users and remove it. + if (I->use_empty()) { + I->eraseFromParent(); + Changed = true; + } + } else if (IntrinsicInst *CI = dyn_cast(I)) { + unsigned ID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (ID == GenXIntrinsic::genx_gather_private) + Changed |= replaceGatherPrivate(CI); + else if (ID == GenXIntrinsic::genx_scatter_private) + Changed |= replaceScatterPrivate(CI); + else if (ID == Intrinsic::lifetime_start || + ID == Intrinsic::lifetime_end) { + CI->eraseFromParent(); + Changed = true; + } + } else if (PHINode *Phi = dyn_cast(I)) { + if (isa(Phi->getType())) + Changed |= replacePhi(Phi); + } else if (SelectInst *Sel = dyn_cast(I)) { + if (isa(Sel->getType())) + Changed |= replaceSelect(Sel); + } + + if (m_AIUsers.empty()) { + if (!Changed) + report_fatal_error("Thread private memory: cannot resolve all alloca uses"); + Changed = false; + collectEachAllocaUsers(); + } + } + + for (auto AllocaPair : m_allocaToIntrinsic) { + assert(AllocaPair.first->use_empty() && + "uses of replaced alloca aren't empty"); + AllocaPair.first->eraseFromParent(); + } + + // TODO: Rewrite split conditions due to possible exec sizes are 1, 2, 4, 8, + // 16 and 32. + for (auto CI : m_gather) { + Type *DstTy = CI->getType(); + unsigned NumElts = DstTy->getVectorNumElements(); + unsigned EltSz = DstTy->getVectorElementType()->getPrimitiveSizeInBits(); + unsigned ExecSz = NumElts * EltSz; + + if (ExecSz > 2 * genx::GRFBits || NumElts > 32) + SplitGather(CI); + } + + for (auto CI : m_scatter) { + Type *DataTy = + CI->getArgOperand(m_ST->useGlobalMem() ? 
3 : 5)->getType(); + unsigned NumElts = DataTy->getVectorNumElements(); + unsigned EltSz = DataTy->getVectorElementType()->getPrimitiveSizeInBits(); + unsigned ExecSz = NumElts * EltSz; + + if (ExecSz > 2 * genx::GRFBits || NumElts > 32) + SplitScatter(CI); + } + + return !m_allocaToIntrinsic.empty(); +} + +void GenXThreadPrivateMemory::visitAllocaInst(AllocaInst &I) { + m_alloca.push_back(&I); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp new file mode 100644 index 000000000000..4f905d4eaf61 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp @@ -0,0 +1,302 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXTidyControlFlow +/// ------------------- +/// +/// This pass tidies the control flow in the following ways: +/// +/// 1. It removes empty blocks (a block is empty if all it contains is an +/// unconditional branch), and thus reduces branch chains in the generated +/// code. It is needed because often a block inserted by critical edge +/// splitting is not needed for any phi copies. +/// +/// 2. It reorders blocks to increase fallthrough generally, and specifically +/// to ensure that SIMD CF goto and join have the required structure: the +/// "false" successor must be fallthrough and the "true" successor must be +/// forward. (The '"true" successor must be forward' requirement is a vISA +/// requirement, because vISA goto/join does not specify JIP, and the +/// finalizer reconstructs it on this assumption.) +/// +/// 3. fixGotoOverBranch: The pass spots where there is a SIMD CF goto over an +/// unconditional branch, and turns the combination into a backwards goto. +/// +/// After reordering blocks, we know that any simd goto has its "false" successor as +/// the following block. If all of the following are true: +/// +/// a. its "true" successor just branches over that same block; +/// +/// b. that block contains only an unconditional branch; +/// +/// c. the UIP of the goto (the join whose RM it updates) is the same as the +/// "true" successor; +/// +/// d. 
the goto condition is not constant 0 (this condition is because we +/// cannot represent a backwards simd goto with this, and it is too late to +/// allocate it a register); +/// +/// then we have the end of a simd do..while loop, and we can optimize to a +/// backwards simd goto. +/// +/// We represent a backwards simd goto in the IR by having the "true" +/// successor as the following block. GenXVisaFuncWriter can then spot that it +/// is a backwards simd goto, and it needs its condition inverting. +/// +/// 4. Ensure that there is a single return block and it is the last block. +/// These are required by the vISA's structurizer. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_TIDYCONTROLFLOW" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * GenXTidyControlFlow pass declaration + */ +namespace { + class GenXTidyControlFlow : public FunctionPass { + const GenXSubtarget *ST = nullptr; + bool Modified; + public: + static char ID; + explicit GenXTidyControlFlow() : FunctionPass(ID), Modified(false) {} + virtual StringRef getPassName() const { return "GenX tidy control flow"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXPrinterPass(O, Banner); } + private: + void removeEmptyBlocks(Function *F); + void reorderBlocks(Function *F); + void fixGotoOverBranch(Function *F); + void fixReturns(Function *F); + }; +} // end anonymous namespace. + +char GenXTidyControlFlow::ID = 0; + +FunctionPass *llvm::createGenXTidyControlFlowPass() { + return new GenXTidyControlFlow; +} + +/*********************************************************************** + * GenXTidyControlFlow::runOnFunction : process a function + */ +bool GenXTidyControlFlow::runOnFunction(Function &F) +{ + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + Modified = false; + removeEmptyBlocks(&F); + reorderBlocks(&F); + fixGotoOverBranch(&F); + fixReturns(&F); + return Modified; +} + +/*********************************************************************** + * removeEmptyBlocks + */ +void GenXTidyControlFlow::removeEmptyBlocks(Function *F) +{ + Function::iterator fi = F->begin(), fe = F->end(); + // Don't consider the entry block. + for (++fi; fi != fe; ) { + BasicBlock *BB = &*fi; + // Increment iterator here as we may be removing this block. + ++fi; + // FIXME: By claiming preserving liveness, we cannot remove phi(s) in empty + // blocks. Need to adjust the pass order if such phi(s) really need + // eliminating. 
+ BranchInst *BI = dyn_cast(&BB->front()); + if (!BI || !BI->isUnconditional()) + continue; + // Do not remove BB if it has more than one predecessor. + if (!BB->hasOneUse()) + continue; + // Check if this is a critical edge splitting block whose predecessor is + // the "false" leg of a goto/join. In that case we do not remove the + // block, as reorderBlocks below may rely on it to ensure that the "false" + // successor of a goto/join can be made fallthrough. + if (BB->hasOneUse() + && BB->use_begin()->getOperandNo() == 1 /*false successor*/ + && GotoJoin::isBranchingGotoJoinBlock(cast( + BB->use_begin()->getUser())->getParent())) { + LLVM_DEBUG(dbgs() << "removeEmptyBlocks: not removing " << BB->getName() << "\n"); + continue; + } + // We are removing this block. First adjust phi nodes in the successor. + auto Succ = BI->getSuccessor(0); + adjustPhiNodesForBlockRemoval(Succ, BB); + // Change all of BB's uses to use its successor instead. + assert(BB->getSinglePredecessor() != BB && "self loop"); + BB->replaceAllUsesWith(BI->getSuccessor(0)); + BI->eraseFromParent(); + BB->eraseFromParent(); + Modified = true; + } +} + +/*********************************************************************** + * reorderBlocks : reorder blocks to increase fallthrough, and specifically + * to satisfy the requirements of SIMD control flow + */ +void GenXTidyControlFlow::reorderBlocks(Function *F) +{ + LoopInfo& LI = getAnalysis().getLoopInfo(); + if (LI.empty()) + LayoutBlocks(*F); + else + LayoutBlocks(*F, LI); + Modified = true; +} + +/*********************************************************************** + * fixGotoOverBranch : fix a (simd) goto over a branch into a backwards goto + * + * See the comment at the top of the file. + */ +void GenXTidyControlFlow::fixGotoOverBranch(Function *F) +{ + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + auto Goto = GotoJoin::isGotoBlock(BB); + if (!Goto) + continue; + auto Br = cast(BB->getTerminator()); + if (!Br->isConditional()) + continue; + // We have a block ending with a conditional branch that is a goto. + // Now check whether it branches over an unconditional branch. + auto Succ = BB->getNextNode(); + if (!Succ || !Succ->hasOneUse()) + continue; + if (Br->getSuccessor(0)->getPrevNode() != Succ) + continue; + auto SuccBr = dyn_cast(Succ->getFirstNonPHIOrDbg()); + if (!SuccBr || SuccBr->isConditional()) + continue; + // The goto branches over just an unconditional branch. + // Check whether its UIP is the same as the branch target. + auto Join = GotoJoin::findJoin(Goto); + if (!Join || Join->getParent() != Br->getSuccessor(0)) + continue; + // Check that the goto condition is not constant. + if (isa(Goto->getOperand(2))) + continue; + // Change the goto's "false" successor to the target of the unconditional + // branch, and remove Succ so the goto's "true" successor becomes + // fallthrough. This then represents a backward goto. + adjustPhiNodesForBlockRemoval(SuccBr->getSuccessor(0), Succ); + Br->setSuccessor(1, SuccBr->getSuccessor(0)); + Succ->eraseFromParent(); + Modified = true; + } +} + +/****************************************************************************** + * fixReturns : only keep a single return block and ensure it is the last block + * of a function. + */ +void GenXTidyControlFlow::fixReturns(Function *F) { + // Loop over all of the blocks in a function, tracking all of the blocks + // that return. 
+ SmallVector ReturningBlocks; + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) + if (isa(I->getTerminator())) + ReturningBlocks.push_back(&*I); + + // We need to insert a new basic block into the function, + // add a PHI nodes (if the function returns values), and convert + // all of the return instructions into unconditional branches. + // + if (ReturningBlocks.size() == 1) { + BasicBlock *RetBlock = ReturningBlocks.front(); + BasicBlock *LastBlock = &F->back(); + if (LastBlock != RetBlock) { + RetBlock->moveAfter(LastBlock); + Modified = true; + } + } else if (ReturningBlocks.size() > 1) { + BasicBlock *NewRetBlock = + BasicBlock::Create(F->getContext(), "UnifiedReturnBlock", F); + PHINode *PN = nullptr; + if (F->getReturnType()->isVoidTy()) + ReturnInst::Create(F->getContext(), nullptr, NewRetBlock); + else { + // If the function doesn't return void, add a PHI node to the block. + PN = PHINode::Create(F->getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F->getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + for (auto BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction + // that is merging into this new block. + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return inst. + BranchInst::Create(NewRetBlock, BB); + } + Modified = true; + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp new file mode 100644 index 000000000000..ffa5220fe2be --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp @@ -0,0 +1,1204 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXUnbaling +/// ------------ +/// +/// After live range building, GenXUnbaling spots cases where baling is harmful +/// due to extending the live range of a big vector. 
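+///
+/// As a simplified illustration (hypothetical IR with made-up names, not
+/// taken from this pass), legalization may leave a sequence such as:
+///
+///   %w1 = wrregion(%old, %x0, ...)  ; %old is the two address operand
+///   %r1 = rdregion(%old, ...)       ; baled into the next wrregion
+///   %w2 = wrregion(%w1, %r1, ...)
+///
+/// The baled rdregion keeps %old live past %w1, so coalescing %old with %w1
+/// fails and a copy of the whole vector has to be inserted before %w1.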
+/// +/// The need for the unbaling pass +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// A *two address operation* (mainly wrregion, but also a few intrinsics that +/// do a predicated read from a shared function unit such as memory) is one +/// where the result needs to be in the same register as one operand, the *two +/// address operand*. GenXCoalescing attempts to coalesce the two together, but +/// it fails if the live range of the two address operand extends beyond the +/// two address instruction. Failure of coalescing means that you get extra +/// code inserted before to copy the whole big vector, and increased register +/// pressure because two values of the big vector are live at the same time. +/// +/// Similarly, with a phi node incoming, GenXCoalescing attempts to coalesce +/// the incoming with the phi node result. Failure means that you get extra +/// code inserted to copy the value at the end of the incoming block. +/// +/// The existence of this problem is due to our use of SSA. Both the input and +/// the output of the wrregion (or the phi incoming and result) are probably +/// the same big vector variable in the source code, and a more traditional +/// compiler would treat the variable as a single (non-SSA) value assigned to +/// its own register, avoiding the need to treat the wrregion specially as a +/// two address operation. +/// +/// With the traditional approach, code motion is more difficult, as an +/// instruction cannot be moved past any other instruction that modifies any of +/// the potentially moving instruction's uses. +/// +/// With our SSA approach, code motion (of an instruction with no memory +/// interaction) is much easier, and we use that in GenXBaling to bale an +/// instruction into another one without needing to check anything in between. +/// (Even though GenXBaling often does not actually move the baled in +/// instruction in IR, it must be treated as if it is at the position of the +/// head of the bale.) +/// +/// The price we pay for that flexibility is that sometimes we move code even +/// when it is harmful to do so. +/// +/// The most common situation where it would fail to coalesce is where +/// legalization has created a sequence of wrregions, and the "old value" input +/// to the first one is also used in a rdregion baled in to each one of the +/// wrregions. +/// +/// Other situations include where some other rdregion use of the two address +/// operand is user code that has been baled to after the instruction, and +/// where the user code actually takes a copy of the big vector and then uses +/// one or more regions out of it after the two address instruction. +/// +/// The GenXUnbaling pass implements two transformations: the non-overlapping +/// region optimization, and the unbaling transformation. +/// +/// Non-overlapping region optimization +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// A common cause of the two address operand, the "old value" input, of a +/// wrregion extending beyond the wrregion is that the wrregion is the first in +/// a sequence created by GenXLegalization, and the same vector is used as an +/// input to rdregions baled in to subsequent bales in the sequence. +/// +/// In this case, a baled in rdregion is reading a part of the vector that has +/// not been overwritten by any earlier wrregion in the sequence. +/// +/// The non-overlapping region optimization detects this case by checking which +/// regions of the vector have been overwritten by earlier wrregions in the +/// sequence. 
If the region read by the rdregion has not been overwritten, then +/// the optimization can change the input to the rdregion to the result of the +/// previous wrregion in the sequence without changing the semantics. +/// +/// If this succeeds for all the rdregions from the same vector in the +/// sequence, then the live range no longer reaches beyond the first wrregion +/// and it can be two address coalesced. +/// +/// The non-overlapping region optimization also handles a similar case where +/// the "old value" input to the first wrregion in the sequence is undef, but +/// of the same type as the input to rdregions through the sequence. As well as +/// modifying each rdregion input to the result of the previous wrregion, it +/// changes the undef input to the first wrregion to the same input vector. +/// This also stops the live range of the inputs to the rdregions overlapping +/// the result, and thus saves register pressure. However it can make the code +/// worse if there are further uses of the input after the sequence, so it only +/// makes the transformation if there are no further uses. +/// +/// Unbaling transformation +/// ^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// At its simplest, the unbaling transformation looks at each two address +/// instruction and phi node incoming, and then looks at the uses of the "old +/// value" input: +/// +/// * A use before the original two address instruction can be ignored as it +/// does not cause the "old value" input to be live beyond that instruction. +/// +/// * A use after the original two address instruction that is not a rdregion +/// cannot be handled, so causes the pass to abandon processing this original +/// two address operation. +/// +/// * A rdregion use after the original two address instruction is unbaled so +/// it regains its pre-baling position before the original two address +/// instruction. +/// +/// Thus the use of the "old value" input in the two address instruction +/// becomes a kill use, and coalescing at that instruction will succeed. Or the +/// phi incoming becomes a kill use, and coalescing it with the phi result will +/// succeed. +/// +/// But there are complications: +/// +/// Moving the unbaled instruction +/// """""""""""""""""""""""""""""" +/// +/// Unbaling an instruction means that its position in the code is now +/// considered to be the same as its position in the IR. Sometimes that is +/// where we want it (before the original two address instruction), since +/// baling tends not to move code. But sometimes it is still after the original +/// two address instruction, most likely because of the order of code split by +/// GenXLegalization. +/// +/// Thus we may need to move the unbaled instruction up to before the original +/// two address instruction. In fact we need to move the whole sub-bale (the +/// new bale headed by the instruction we are unbaling). A rdregion can have an +/// llvm.genx.add.address intrinsic baled in if it has a variable index. +/// +/// If the unbaled instruction is dominated by the original two address +/// instruction, we move it to just before that. Otherwise, we move it to the +/// end of the basic block that is the nearest common dominator of the two +/// locations. +/// +/// To move a bale up, we need to ensure that all outside-bale operands are +/// defined before where we are going to move it to. If that is not the case, +/// then unbaling for the original two address instruction fails. 
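Whether the sub-bale may be moved reduces to one question: is every outside-bale operand already defined at the chosen insertion point? A standalone sketch under the simplification that instructions are identified by linear program points (the pass itself answers this with the InstSeen set and dominator queries rather than explicit numbering; the names below are illustrative):

    #include <vector>

    // An instruction of the sub-bale, reduced to the program points at which
    // its outside-bale operands are defined.
    struct BaleInstModel {
      std::vector<unsigned> OperandDefPoints;
    };

    // The sub-bale can be moved to InsertPoint only if every outside-bale
    // operand of every instruction in it is defined strictly before that point.
    static bool canMoveBaleTo(const std::vector<BaleInstModel> &SubBale,
                              unsigned InsertPoint) {
      for (const BaleInstModel &I : SubBale)
        for (unsigned DefPoint : I.OperandDefPoints)
          if (DefPoint >= InsertPoint)
            return false; // an operand would not yet be available
      return true;
    }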
+/// +/// Moving when there is a variable index +/// """"""""""""""""""""""""""""""""""""" +/// +/// For a rdregion with a variable index, there is an llvm.genx.conv.address +/// intrinsic, which represents the setting of an address register relative to +/// the base register that the rdregion will access. GenXCategory ensures that +/// there is one llvm.genx.conv.address intrinsic for each variable index +/// rdregion or wrregion, since it does not know which region accesses are +/// going to be in the same register. Commoning up of address conversions is +/// done later, after coalescing has decided which ones are in the same base +/// register. +/// +/// The problem for GenXUnbaling is that the llvm.genx.conv.address is likely +/// to be just before the rdregion, which stops the rdregion being moved to +/// before the original two address instruction. +/// +/// The solution is to pretend that the llvm.genx.conv.address (and anything it +/// bales in, probably a zext/sext) is part of the rdregion's bale, just for +/// GenXUnbaling's purposes of telling whether it is OK to move it, and then +/// actually moving it. GenXBaling::buildBale() has an extra IncludeAddr flag +/// to enable this behavior. +/// +/// What is before and after? +/// """"""""""""""""""""""""" +/// +/// The notion of whether an instruction is before or after the original two +/// address instruction is more complex in the presence of control flow. +/// +/// This pass distinguishes the following cases: +/// +/// * Before: The instruction dominates the original two address instruction, +/// so can be considered before it. No use in the instruction reaches back to +/// the original two address instruction. +/// +/// * After: The original two address instruction dominates the instruction, so +/// can be considered after it. A use in the instruction causes liveness to +/// reach back to the original two address instruction (as long as the use's +/// definition is before that). +/// +/// * Reaches: Neither dominates the other, but a use in the instruction causes +/// liveness to reach back to the original two address instruction anyway. +/// This is determined by actually tracing back all the branches through the +/// control graph, abandoning a branch when it rejoins with another one or +/// reaches the definition. +/// +/// * Not reaches: Neither dominates the other, but we can prove that a use in +/// the instruction does not cause liveness to reach back to the original two +/// address instruction. +/// +/// When processing a phi incoming rather than a two address instruction, it is +/// considered to be at the end of the corresponding incoming block, rather +/// than at the site of the phi node. +/// +/// If we have "not reaches", then the use can be ignored in the same way as a +/// "before" use. +/// +/// If we have "reaches", then we can still unbale it. If the new sub-bale +/// needs moving, then we move it to the end of the block that is the nearest +/// common dominator of its old location and the original two address +/// instruction. +/// +/// A use in a phi node is considered to be at the end of the incoming block +/// for the purposes of determining its position. +/// +/// Commoning up unbaled sub-bales +/// """""""""""""""""""""""""""""" +/// +/// It is often the case that baling has caused the same rdregion to be cloned +/// (because a baled in instruction can only have a single use), so unbaling +/// the baled in rdregions causes duplicate instructions. 
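The REACHES/NOTREACHES distinction from the "What is before and after?" subsection above is decided by a backward walk over predecessor blocks that stops at already-visited blocks and at the defining block. A standalone sketch over an adjacency-list CFG with blocks as indices (getReachability() later in this file additionally handles the dominance cases and caches results per block):

    #include <set>
    #include <vector>

    // Does liveness of a use in UseBlock reach back to CurBlock without passing
    // through DefBlock?  Preds[B] lists the predecessors of block B.
    static bool reachesCurrentBlock(const std::vector<std::vector<int>> &Preds,
                                    int UseBlock, int CurBlock, int DefBlock) {
      std::vector<int> Stack{UseBlock};
      std::set<int> Seen;
      while (!Stack.empty()) {
        int Block = Stack.back();
        Stack.pop_back();
        if (!Seen.insert(Block).second)
          continue; // already traced through this block
        if (Block == CurBlock)
          return true; // REACHES: liveness gets back to the current position
        if (Block == DefBlock)
          continue; // hit the definition, stop tracing this branch
        for (int Pred : Preds[Block])
          Stack.push_back(Pred);
      }
      return false; // NOTREACHES
    }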
No CSE is run after +/// this point, as that would cause various problems including messing up the +/// baling and the address conversion. +/// +/// Therefore, this pass needs to spot when it is unbaling duplicate sub-bales +/// and common them up. +/// +/// Unbaling the main instruction instead of the rdregion +/// """"""""""""""""""""""""""""""""""""""""""""""""""""" +/// +/// In some cases, the rdregion is in a bale that also contains another +/// rdregion of the same big vector. Unbaling the two rdregions separately +/// would create two extra instructions. We can reduce that to one extra +/// instruction by instead unbaling the main instruction from the wrregion at +/// the head, so only the wrregion is left at its original position in the code +/// and the rest of the bale is moved up. +/// +/// The pass only does that if it detects more than one use of the big vector +/// in the bale. +/// +/// When trying to do this, and the proposed sub-bale needs to be moved rather +/// than just unbaled, we may see that not all outside-bale operands are +/// defined before the original two address instruction. In that case, we +/// abandon the attempt to unbale the main instruction, and instead go back to +/// unbaling just the rdregion, which may succeed. +/// +/// Bitcasts +/// """""""" +/// +/// Because GenXCoalescing does "copy coalescing" of bitcasts first, we need to +/// consider not just the rdregion uses of the input to the original two +/// address instruction, but also uses of the whole tree of bitcasts containing +/// it. Not doing that stops the optimization working when the source CM code +/// contains format() functions. +/// +/// Such bitcasts may need to be moved up to just before the original two +/// address instruction, in case any use of it is moved. In fact we just move +/// the whole tree of bitcasts to just after the definition of the root of the +/// tree. This does not worsen code quality because the bitcasts will all be +/// copy coalesced together anyway. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_UNBALING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXUnbaling : public FunctionGroupPass { + enum { UNKNOWN, BEFORE, AFTER, NOTREACHES, REACHES }; + + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + DominatorTree *DT; + bool Modified; + BasicBlock *CurBlock; + std::map ReachabilityCache; + std::set InstSeen; + ValueMap InstSeenInProcessNonOverlappingRegion; + SmallVector ToErase; + // Fields used to process a single two address instruction. 
+ struct ToUnbaleEntry { + Instruction *Inst; // instruction to unbale + Instruction *InsertBefore; // where to move it to, 0 if no move + ToUnbaleEntry(Instruction *Inst, Instruction *InsertBefore) + : Inst(Inst), InsertBefore(InsertBefore) {} + }; + SmallVector ToUnbale; + std::map CommonBaleMap; +public: + static char ID; + explicit GenXUnbaling() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { return "GenX unbaling"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + override { return createGenXGroupPrinterPass(O, Banner); } +private: + void processFunc(Function *F); + void shortenLiveRanges(Function *F); + bool interfere(Value *V1, Value *V2); + void processTwoAddrOrPhi(Instruction *Inst, unsigned TwoAddrOperandNum); + bool scanUsesForUnbaleAndMove(Instruction *Inst, Value *TwoAddrOperand); + int getReachability(Instruction *Inst, Instruction *Def); + void processNonOverlappingRegion(CallInst *Wr); +}; + +} // end anonymous namespace + +namespace llvm { void initializeGenXUnbalingPass(PassRegistry &); } +char GenXUnbaling::ID = 0; +INITIALIZE_PASS_BEGIN(GenXUnbaling, "GenXUnbaling", "GenXUnbaling", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXUnbaling, "GenXUnbaling", "GenXUnbaling", false, false) + +FunctionGroupPass *llvm::createGenXUnbalingPass() { + initializeGenXUnbalingPass(*PassRegistry::getPassRegistry()); + return new GenXUnbaling(); +} + +void GenXUnbaling::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the liveness analysis for this FunctionGroup + */ +bool GenXUnbaling::runOnFunctionGroup(FunctionGroup &FG) { + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + Modified = false; + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) + processFunc(*fgi); + return Modified; +} + +/*********************************************************************** + * processFunc : process one function in GenXUnbaling + * + * This does a postordered depth first traversal of the CFG, processing + * instructions within a basic block in reverse, to ensure that we see a def + * after its uses (ignoring phi node uses). That is required for the + * non-overlapping region optimization, as we need to perform that on a bale + * before an earlier wrregion sees the use in the rdregion and unbales it. + */ +void GenXUnbaling::processFunc(Function *F) { + LLVM_DEBUG(dbgs() << "GenXUnbaling on " << F->getName() << "\n"); + DT = getAnalysis().getDomTree(F); + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); i != e; ++i) { + CurBlock = *i; + // Process our incomings of successors' phi nodes. 
+ auto TI = CurBlock->getTerminator(); + for (unsigned si = 0, se = TI->getNumSuccessors(); si != se; ++si) { + BasicBlock *Succ = TI->getSuccessor(si); + for (auto bi = Succ->begin(); ; ++bi) { + auto Phi = dyn_cast(bi); + if (!Phi) + break; + unsigned IncomingNum = Phi->getBasicBlockIndex(CurBlock); + processTwoAddrOrPhi(Phi, IncomingNum); + } + } + for (auto Inst = &CurBlock->back(); Inst; + Inst = Inst == &CurBlock->front() ? nullptr : Inst->getPrevNode()) { + // Process a two address instruction. (All two address instructions are + // intrinsics and thus calls.) + if (auto CI = dyn_cast(Inst)) { + int TwoAddrOperandNum = getTwoAddressOperandNum(CI); + if (TwoAddrOperandNum >= 0) { + processTwoAddrOrPhi(CI, TwoAddrOperandNum); + if (GenXIntrinsic::isWrRegion(CI)) + processNonOverlappingRegion(CI); + } + } + // Mark the instruction as seen. + InstSeen.insert(Inst); + } + InstSeen.clear(); + InstSeenInProcessNonOverlappingRegion.clear(); + ReachabilityCache.clear(); + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + } + + shortenLiveRanges(F); +} + +/*********************************************************************** + * shortenLiveRanges : hoist rdregions if this helps to avoid copy coalescing. + * + * %1 = wrregion ... + * ... + * %2 = wrregion(%1, ...) + * ... + * %3 = rdregion (%1, ...) + * no other uses of %1 except rdregions + * + * In this situation, compiler will do copy coalescing(See GenXCoalescing) %2 + * from %1. If %1 is a big region, we will have a lot of movs. But if %3 reads + * a small region, it's cheaper to hoist it between %1 and %2. Compiler will + * generate a copy for this small region, but %2 will be coalesced without + * copying. + */ +void GenXUnbaling::shortenLiveRanges(Function *F) { + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + for (Instruction &Inst : *BB) { + auto DstRegion = dyn_cast(&Inst); + if (DstRegion && GenXIntrinsic::isWrRegion(DstRegion)) { + // now we've found %2 = wrregion. Firstly, let's check that %1 and %2 + // interfere and after search for rdregions(%3 and others). + auto SrcRegion = dyn_cast(DstRegion->getOperand(0)); + if (!SrcRegion || !GenXIntrinsic::isWrRegion(SrcRegion) || + !interfere(SrcRegion, DstRegion)) + continue; + + // Collect all %1 users that are "under" %2. + unsigned DstNumber = Numbering->getNumber(DstRegion); + SmallVector ToHoist; + std::copy_if(SrcRegion->user_begin(), SrcRegion->user_end(), + std::back_inserter(ToHoist), + [DstNumber, N = Numbering](User *U) { + return DstNumber < N->getNumber(U); + }); + bool CanHoist = + std::all_of(ToHoist.begin(), ToHoist.end(), [BB](User *U) { + return U->isUsedInBasicBlock(BB) && GenXIntrinsic::isRdRegion(U); + }); + if (!CanHoist || ToHoist.empty()) + continue; + + // Is it reasonable to hoist rdregions? Let's compare the number of + // elements to copy in both cases. 
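The comparison asked for by the comment above amounts to summing the element counts of the rdregions that would be hoisted and checking the total against the element count of the whole source vector. A standalone sketch of that profitability test (the name is illustrative):

    #include <numeric>
    #include <vector>

    // Hoisting pays off only if copying the hoisted rdregions' (small) regions
    // is cheaper than the whole-vector copy that failed coalescing would cause.
    static bool hoistingIsProfitable(const std::vector<unsigned> &RdRegionSizes,
                                     unsigned SrcVectorElements) {
      unsigned EltsToCopy =
          std::accumulate(RdRegionSizes.begin(), RdRegionSizes.end(), 0u);
      return EltsToCopy < SrcVectorElements;
    }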
+ unsigned NumEltsToCopy = std::accumulate( + ToHoist.begin(), ToHoist.end(), 0u, [](unsigned Init, User *U) { + return Init + cast(U->getType())->getNumElements(); + }); + if (NumEltsToCopy >= + cast(SrcRegion->getType())->getNumElements()) + continue; + + // Unbale and hoist + for (User *U : ToHoist) { + auto RdR = dyn_cast(U); + assert(RdR && GenXIntrinsic::isRdRegion(RdR)); + Instruction *InsertBefore = DstRegion; + if (auto UnbaleFrom = Baling->getBaleParent(RdR)) { + BaleInfo BI = Baling->getBaleInfo(UnbaleFrom); + BI.clearOperandBaled(RdR->use_begin()->getOperandNo()); + Baling->setBaleInfo(UnbaleFrom, BI); + } + RdR->moveBefore(InsertBefore); + Modified = true; + } + } + } + } +} + +bool GenXUnbaling::interfere(Value *V1, Value *V2) { + assert(V1); + assert(V2); + + LiveRange *V1LR = Liveness->getLiveRangeOrNull(V1); + LiveRange *V2LR = Liveness->getLiveRangeOrNull(V2); + // We cannot analyze without LR. + if (!V1LR || !V2LR) + return false; + return Liveness->twoAddrInterfere(V1LR, V2LR); +} + +/*********************************************************************** + * processTwoAddrOrPhi : process a two address instruction or phi node + * incoming + * + * Enter: Inst = two address inst or phi node + * TwoAddrOperandNum = two address operand number (incoming number + * for phi) + * + * For a phi node incoming, this is called when CurBlock and InstSeen reflect + * that processing has reached the end of the incoming's block, rather than the + * start of the block containing the phi node itself. + */ +void GenXUnbaling::processTwoAddrOrPhi(Instruction *Inst, + unsigned TwoAddrOperandNum) { + Value *TwoAddrOperand = Inst->getOperand(TwoAddrOperandNum); + if (isa(TwoAddrOperand)) + return; + LLVM_DEBUG(dbgs() << "\nGenXUnbaling::processTwoAddrOrPhi[" << TwoAddrOperandNum + << "]: " << *Inst << "\n"); + if (!scanUsesForUnbaleAndMove(Inst, TwoAddrOperand)) + return; + // Move the tree of bitcasts containing TwoAddrOperand to just after its def. + // (If that would be before a phi node, because the def is a phi node other + // than the last in its block, then insert just before first non-phi in the + // block. If the def is an Argument, insert at the start of the code.) We may + // need to move some of them earlier if their uses are going to be moved, and + // just moving them all as early as possible is easiest. That does not + // affect register pressure or code size as a bitcast generates no code and + // is copy coalesced together. + // + // We do not worry about the possibility of moving the bitcasts into a join + // label block. Although a join label block must start with a join after the + // phi nodes, bitcasts are allowed as they generate no code. + Value *Root = TwoAddrOperand; + while (auto BC = dyn_cast(Root)) + Root = BC->getOperand(0); + Value *V = Root; + Instruction *InsertBefore = nullptr; + if (auto I = dyn_cast(Root)) { + InsertBefore = I->getNextNode(); + if (isa(InsertBefore)) + InsertBefore = InsertBefore->getParent()->getFirstNonPHI(); + } else + InsertBefore = Inst->getParent()->getParent()->front().getFirstNonPHI(); + SmallVector BitCastQueue; + for (unsigned bci = 0;;) { + // For this value, find uses that are bitcast and save them. + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) + if (auto BC = dyn_cast(ui->getUser())) + BitCastQueue.push_back(BC); + // Go on to the next bitcast in the queue. + if (bci == BitCastQueue.size()) + break; + auto BC = BitCastQueue[bci++]; + // Move this bitcast. 
+ if (BC == InsertBefore) + InsertBefore = BC->getNextNode(); + else + BC->moveBefore(InsertBefore); + V = BC; + } + // Unbale and/or move uses found in scanUsesForUnbaleAndMove(). + for (auto ti = ToUnbale.begin(), te = ToUnbale.end(); ti != te; ++ti) { + Instruction *Unbale = ti->Inst; + Instruction *InsertBefore = ti->InsertBefore; + LLVM_DEBUG(dbgs() << "Unbaling and/or moving " << Unbale->getName() + << " (or removing if it is a duplicate)\n"); + // Unbale from its bale parent (if any). + if (auto UnbaleFrom = Baling->getBaleParent(Unbale)) { + LLVM_DEBUG(dbgs() << "Unbaling " << Unbale->getName() << " from " + << UnbaleFrom->getName() << " in bale " + << Baling->getBaleHead(UnbaleFrom)->getName() << "\n"); + BaleInfo BI = Baling->getBaleInfo(UnbaleFrom); + BI.clearOperandBaled(Unbale->use_begin()->getOperandNo()); + Baling->setBaleInfo(UnbaleFrom, BI); + } + auto Found = CommonBaleMap.find(Unbale); + if (Found != CommonBaleMap.end()) { + LLVM_DEBUG(dbgs() << "Duplicate of " << Found->second->getName() + << ", removing\n"); + Unbale->replaceAllUsesWith(Found->second); + Bale B; + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + Liveness->removeBale(B); + B.eraseFromParent(); + } else { + // Move it if necessary. + if (InsertBefore) { + LLVM_DEBUG(dbgs() << "Moving bale at " << Unbale->getName() + << " to before " << InsertBefore->getName() + << " in " << InsertBefore->getParent()->getName() << "\n"); + Bale B; + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + auto MoveInst = bi->Inst; + LLVM_DEBUG(dbgs() << " moving " << MoveInst->getName() << "\n"); + MoveInst->moveBefore(InsertBefore); + } + } + } + } + Modified = true; +} + +/*********************************************************************** + * scanUsesForUnbaleAndMove : scan uses of TwoAddrOperand to see if we can + * unbale and/or move them to before the current position + * + * Enter: Inst = instruction at current position + * TwoAddrOperand : value whose uses we scan + * + * Return: true if we want to unbale/move some uses + * + * This function clears then populates the following GenXUnbaling fields: + * + * ToUnbale = vector to store instructions that want to be unbaled and/or moved. + * CommonBaleMap = map to store mapping for common bales. + * + * A duplicate instruction is also in ToUnbale, but after the instruction it + * duplicates. + * + * The function spots the following cases (picking the first that applies): + * + * 1. All uses already before Inst. Returns false. + * 2. There is some use whose liveness reaches back to Inst, but is not + * dominated by Inst, so we cannot do anything. Returns false. + * 3. There is some use in an instruction after Inst which we cannot unbale + * and/or move so it is before Inst because it has an outside-bale operand + * whose def is not before Inst. Returns false. + * 4. All uses after Inst can be unbaled and/or moved, but (after commoning + * them up) that would result in a number of extra instructions that + * outweights the number saved by failing to coalesce Inst. Returns false. + * 5. There is some use in an instruction after Inst that is not a rdregion + * use. We cannot do anything with that. Returns false. + * 6. Otherwise, return true to tell the caller to go ahead and unbale/move + * the instructions in ToUnbale (or common up with another one if it is + * in CommonBaleMap). + * + * We also need to look at uses of a tree of bitcasts of TwoAddrOperand, as + * they will be copy coalesced. 
+ */ +bool GenXUnbaling::scanUsesForUnbaleAndMove(Instruction *Inst, + Value *TwoAddrOperand) { + ToUnbale.clear(); + CommonBaleMap.clear(); + std::set UseSeen; + std::set CommonBales; + unsigned UnbaleCount = 0; + // Scan uses of TwoAddrOperand, and, if any use is a bitcast, scan its uses, + // and so on through the tree of bitcasts. If TwoAddrOperand is itself the + // result of a bitcast, scan up to the root of the bitcast tree first. + SmallVector BitCasts; + Value *Root = TwoAddrOperand; + while (auto BC = dyn_cast(Root)) + Root = BC->getOperand(0); + for (unsigned bci = 0;;) { + for (auto ui = Root->use_begin(), ue = Root->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Phi = dyn_cast(User)) { + if (Phi == Inst) + continue; // Ignore use in phi node that we started at. + // For a phi node, determine the use's position relative to the current + // position as if it is at the end of the incoming block. + int Position = getReachability( + Phi->getIncomingBlock(*ui)->getTerminator(), + dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "phi use in " << User->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (Position == BEFORE || Position == NOTREACHES) + continue; + return false; + } + auto UserHead = Baling->getBaleHead(User); + if (UserHead == Inst) + continue; // Ignore use in wrregion Inst that we started at. + LLVM_DEBUG(dbgs() << "use in " << *User << "\n"); + if (!UseSeen.insert(User).second) { + LLVM_DEBUG(dbgs() << "use in " << User->getName() + << " has already been accounted for\n"); + continue; + } + // Determine the use's position relative to the current position. We use + // the bale head's position. + int Position = + getReachability(UserHead, dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "use in " << User->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (Position == NOTREACHES) + continue; // ignore use unreachable from Inst + if (isa(User)) { + // This is a bitcast -- add it to BitCasts so we use it as a Root later + // and scan its uses (even if it is before Inst, as its uses might + // still be after Inst). + LLVM_DEBUG(dbgs() << "use in " << User->getName() << " is bitcast\n"); + BitCasts.push_back(User); + continue; + } + if (Position == BEFORE) + continue; // Ignore use that is already before Inst. + // Check that the use is operand 0 of rdregion. + if (ui->getOperandNo() || !GenXIntrinsic::isRdRegion(User)) { + LLVM_DEBUG(dbgs() << "use in " << User->getName() + << " is after but is not rdregion\n"); + return false; + } + // If the result of the rdregion is too big (more than 32 elements or + // more than 2 GRFs), we cannot unbale it. This happens with an rdregion + // baled in to a raw operand of a shared function intrinsic. Unbaling it + // would result in an illegally wide instruction. + if (auto VT = dyn_cast(User->getType())) { + if (VT->getNumElements() > 32U + || VT->getPrimitiveSizeInBits() > 512U) { + LLVM_DEBUG(dbgs() << User->getName() << " is too wide to unbale\n"); + return false; + } + } + // We have decided that this use needs unbaling and/or moving. Decide how + // we are going to do it, without actually doing it yet. First assume + // that we're going to unbale User from its bale parent, if it is baled + // at all. 
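One step implemented further down in this loop is the commoning-up of duplicate sub-bales: because a baled-in instruction can only have a single use, baling clones rdregions, so identical sub-bales are keyed in the CommonBales set and later copies are redirected through CommonBaleMap to the first one. A standalone sketch of that de-duplication, using a plain string signature in place of the real bale hash (types and names here are illustrative):

    #include <map>
    #include <string>
    #include <vector>

    // Stand-in for an unbaled sub-bale: Id identifies its head instruction and
    // Signature is a canonical description of its contents.
    struct SubBaleModel {
      int Id;
      std::string Signature;
    };

    // For each sub-bale return the id of its representative: itself if it is
    // the first of its kind, otherwise the earlier identical sub-bale it
    // should be replaced with.
    static std::vector<int>
    commonUpDuplicates(const std::vector<SubBaleModel> &Bales) {
      std::map<std::string, int> Seen; // signature -> representative id
      std::vector<int> Representative;
      for (const SubBaleModel &B : Bales)
        Representative.push_back(Seen.insert({B.Signature, B.Id}).first->second);
      return Representative;
    }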
+ Instruction *Unbale = User; + Bale B; + if (GenXIntrinsic::isWrRegion(UserHead)) { + // The bale head is a wrregion. Unbale the main instruction from it, + // rather than just the user of the overlapping vector, as long as the + // resulting smaller bale contains at least two uses of TwoAddrOperand + // (or a bitcast thereof), and each outside-bale operand in the bale is + // defined before Inst. + Unbale = dyn_cast( + UserHead->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (Unbale) { + // We use IncludeAddr=true on the buildBale. That makes it include + // any address calculation (convert.addr and add.addr ops), even + // though they are not baled in. What that gives us is: + // + // 1. When comparing bales in the CommonUses set to find another bale + // that we can common up with, it makes two rdregions look the + // same even though they have separate copies of their address + // calculation. + // + // 2. The code here that checks if all the outside-bale operands are + // defined early enough and then moves the bale also moves the + // address calculation, which is what we want. + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + B.hash(); + LLVM_DEBUG(B.print(dbgs())); + // Check for multiple uses. (A use is always in operand 0 of + // rdregion.) + unsigned UseCount = 0; + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + if (bi->Info.Type != BaleInfo::RDREGION) + continue; + Value *Opnd = bi->Inst->getOperand(0); + if (Opnd == Root) + ++UseCount; + else + for (auto ri = BitCasts.begin(), + re = BitCasts.end(); ri != re; ++ri) + if (bi->Inst->getOperand(0) == *ri) + ++UseCount; + } + assert(UseCount >= 1); + if (UseCount <= 1) { + // Did not get multiple uses. Just unbale the rdregion use. + if (Unbale != User) { + B.clear(); + Unbale = User; + } + } else { + LLVM_DEBUG(dbgs() << "Trying unbale from wrregion\n"); + if (!UseSeen.insert(Unbale).second) { + LLVM_DEBUG(dbgs() << "use (unbale from wrregion) in " + << User->getName() + << " has already been accounted for\n"); + continue; + } + } + } + } + if (!Unbale) + return false; + // Loop to try unbaling from wrregion first, then try just unbaling the + // rdregion. + Instruction *InsertBefore = nullptr; // start assuming not moving sub-bale + for (;;) { + // Build the sub-bale we are proposing to unbale (if not already built + // in the code above). See comment above about using IncludeAddr=true. + if (B.empty()) { + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + B.hash(); + LLVM_DEBUG(B.print(dbgs())); + } + // Get the position relative to Inst of the sub-bale we propose to + // unbale. If it is already BEFORE, then we don't need to check for all + // outside-bale operands being before Inst. + int UnbalePos = getReachability(Unbale, + dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "proposed unbale " << Unbale->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (UnbalePos == BEFORE) { + InsertBefore = nullptr; // no need to move instruction + break; // ok to unbale here + } + // We need to move the unbaled instruction. Work out where we need to + // move it to. + if (UnbalePos == AFTER && !isa(Inst)) + InsertBefore = Inst; // insert before original two addr inst + else { + // The instruction to be unbaled is not dominated by the original two + // addr inst, or we were processing a phi incoming rather than a two + // addr inst. 
We want to find the nearest common dominator and insert + // at the end of that block. + InsertBefore = DT->findNearestCommonDominator( + CurBlock, Unbale->getParent())->getTerminator(); + // Ensure we have a legal insertion point in the presence of SIMD CF. + InsertBefore = GotoJoin::getLegalInsertionPoint(InsertBefore, DT); + } + // We will need to move the unbaled instruction to before Inst. Check + // that each outside-bale operand in the bale is defined before the + // insert point. + bool IsBeforeInst = true; + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + for (unsigned oi = 0, oe = bi->Inst->getNumOperands(); + oi != oe && IsBeforeInst; ++oi) { + if (!bi->Info.isOperandBaled(oi)) { + auto Opnd = bi->Inst->getOperand(oi); + // Check for Opnd's definition being before the insert point: + // + // 1. If it is an Argument rather than an Instruction, it is + // before. + if (auto OpndInst = dyn_cast(Opnd)) { + // 2. If in same basic block: + // 2a. If insert point is Inst (the original two addr inst), + // use InstSeen to work out if it is before or after. + // 2b. Otherwise, it is always before because InsertBefore is + // at the end of its basic block. + if (OpndInst->getParent() == InsertBefore->getParent()) { + if (InsertBefore == Inst) + IsBeforeInst &= OpndInst != Inst + && InstSeen.find(OpndInst) == InstSeen.end(); + } else + // 3. If in different basic block, check dominance. + IsBeforeInst &= DT->dominates( + OpndInst->getParent(), InsertBefore->getParent()); + } + if (!IsBeforeInst) { + LLVM_DEBUG(dbgs() << " outside-bale operand " << Opnd->getName() + << " is not before Inst\n"); + break; + } + } + } + } + if (IsBeforeInst) { + // OK to unbale and move to InsertBefore. + break; + } + // We have failed, either by Unbale's position being REACHES so we + // can't move it, or by its position being AFTER so we need to move it + // but there is an outside-bale operand that is not before Inst. + if (Unbale != User) { + // This is the case that we were trying to unbale out of the + // wrregion. This has now failed, and we re-try unbaling just the + // rdregion use. + LLVM_DEBUG(dbgs() << "Failed to unbale out of wrregion; " + << "retrying at rdregion\n"); + Unbale = User; + B.clear(); + continue; + } + // We have found an outside-bale operand that is not defined before + // Inst, presumably an operand to the address calculation of the + // rdregion. We have to give up at this point. + LLVM_DEBUG(dbgs() << "Failed to unbale rdregion; abandon\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Can unbale and/or move\n"); + // See if we already have a common bale. If so, point this use at it. + auto Found = CommonBales.find(B); + if (Found != CommonBales.end()) { + LLVM_DEBUG(dbgs() << "Found common bale " + << Found->getHead()->Inst->getName() << "\n"); + CommonBaleMap[Unbale] = Found->getHead()->Inst; + } else { + CommonBales.insert(B); + // If there will actually be an unbale, count it. + UnbaleCount += Baling->isBaled(Unbale); + } + // Add this bale to the list of bales to unbale and/or move. + LLVM_DEBUG( + if (!InsertBefore) + dbgs() << "Adding " << Unbale->getName() << " to ToUnbale list\n"; + else + dbgs() << "Adding " << Unbale->getName() << " (with move to before " + << InsertBefore->getName() << " in " + << InsertBefore->getParent()->getName() << ") to Unbale list\n"; + ); + ToUnbale.push_back(ToUnbaleEntry(Unbale, InsertBefore)); + } + // Also look at uses of bitcasts in the bitcast tree. 
+ if (bci == BitCasts.size()) + break; + Root = BitCasts[bci++]; + } + if (ToUnbale.empty()) { + LLVM_DEBUG(dbgs() << "Nothing to unbale/move, " + << "must already be kill use at Inst\n"); + return false; + } + // Calculate how many instructions would be needed for the copy caused by + // TwoAddrOperand failing to coalesce with Inst, and compare that with the + // number of extra instructions caused by the unbaling that we propose to do + // to avoid it. + unsigned NumBytes = TwoAddrOperand->getType()->getPrimitiveSizeInBits() / 8U; + unsigned NumCopies = NumBytes / 64U; // one copy per 2 GRFs + NumBytes -= NumCopies * 64U; + NumCopies += countPopulation(NumBytes); // extra copy per power of 2 + LLVM_DEBUG(dbgs() << NumCopies << " copy insts, vs " + << UnbaleCount << " unbales\n"); + if (NumCopies < UnbaleCount) { + LLVM_DEBUG(dbgs() << "Too many new instructions, code would be worse.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "We have uses to unbale/move.\n"); + return true; +} + +/*********************************************************************** + * getReachability : determine relationship of Inst with current position + * + * Enter: Inst = instruction to get position of + * Def = 0 else instruction that defines use whose liveness we are + * interested in + * + * Return: BEFORE: Inst is before current pos (Inst dominates current pos) + * AFTER: Inst is after current pos (current pos dominates Inst) + * REACHES: no dominance, and liveness of use in Inst reaches back to + * current pos without passing through Def + * NOTREACHES: no dominance, and liveness of use in Inst does not reach + * back to current pos without passing through Def + * + * In the case that there is no simple dominance relationship between Inst and + * the current position, Def is used to stop the backwards scan. For a value + * defined inside a loop, if you don't supply def then this function will + * always return REACHES as it will trace backwards round the loop. + * + * The current position is represented by CurBlock and which already seen + * instructions in that block are in InstSeen. + * + * We keep a cache of results. This is cleared when the current basic block + * changes. + */ +int GenXUnbaling::getReachability(Instruction *Inst, Instruction *Def) +{ + auto Block = Inst->getParent(); + // Check simple case of same basic block. + if (CurBlock == Block) + return InstSeen.find(Inst) != InstSeen.end() ? AFTER : BEFORE; + // Check ReachabilityCache. + auto It = ReachabilityCache.insert( + std::pair(Block, UNKNOWN)).first; + if (It->second != UNKNOWN) + return It->second; + // Check dominance. + if (DT->dominates(Block, CurBlock)) + return It->second = BEFORE; + if (DT->dominates(CurBlock, Block)) + return It->second = AFTER; + // Trace liveness of use in Inst backwards and see if we reach CurBlock. + BasicBlock *DefBlock = nullptr; + if (Def) + DefBlock = Def->getParent(); + SmallVector Stack; + std::set BlockSeen; + Stack.push_back(Block); + while (!Stack.empty()) { + Block = Stack.back(); + Stack.pop_back(); + if (!BlockSeen.insert(Block).second) + continue; // already seen, terminate this branch of the scan + if (Block == CurBlock) + return It->second = REACHES; // reached current pos + if (Block == DefBlock) + continue; // reached def, terminate this branch of the scan + // Add the predecessors of this block to the stack. 
+ for (auto ui = Block->use_begin(), ue = Block->use_end(); ui != ue; ++ui) + Stack.push_back(cast(ui->getUser())->getParent()); + } + return It->second = NOTREACHES; +} + +/*********************************************************************** + * processNonOverlappingRegion : perform the non-overlapping region optimization + * + * Enter: EndWr = wrregion instruction for possible end of wrregion sequence + * + * If EndWr is head of a bale that includes a rdregion, and it is part of a + * sequence of wrregions whose first "old value" input is the same as the input + * to the rdregion, then check whether the rdregion's region has been + * overwritten in the sequence. If not, change the rdregion's input to the same + * as that of Wr. + * + * The idea is that we can avoid overlapping live ranges and hence unbaling. + * + * This also handles the case that the "old value" input to the start wrregion + * is undef, and we want to make the transformation (and change that start + * wrregion input too) to save a live range overlap in the sequence. However, + * we only do that if we can prove that it does not make the code worse, which + * it does if the rdregion input is still live after the sequence. + */ +void GenXUnbaling::processNonOverlappingRegion(CallInst *EndWr) +{ + // Avoid processing a sequence of N wrregions N times, giving O(N^2) + // complexity -- only process when we see the end of the sequence. + if (InstSeenInProcessNonOverlappingRegion.find(EndWr) + != InstSeenInProcessNonOverlappingRegion.end()) + return; + // Find the sequence of wrregions, each except the last having the next as + // its only use. + CallInst *StartWr = EndWr; + Value *StartWrInput = nullptr; + bool WrVariableIndex = false; + for (;;) { + WrVariableIndex |=!isa( + StartWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum)); + StartWrInput = + StartWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (!GenXIntrinsic::isWrRegion(StartWrInput)) + break; + if (!StartWrInput->hasOneUse()) + break; + StartWr = cast(StartWrInput); + InstSeenInProcessNonOverlappingRegion[StartWr] = true; + } + if (StartWr == EndWr) + return; // no sequence + if (WrVariableIndex) + return; // Can't deal with variable index + Value *RdInput = StartWrInput; + if (isa(StartWrInput)) { + // In the case that the input to the start wrregion is undef, we need to + // find a rdregion input that is the same type. + RdInput = nullptr; + Bale B; + Baling->buildBale(StartWr, &B); + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + if (bi->Info.Type != BaleInfo::RDREGION) + continue; + Value *Input = bi->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (Input->getType() != StartWrInput->getType()) + continue; + RdInput = Input; + if (isa(Input)) { + // Prefer to save a live-range on Phi, which may help to + // save phi copies. This is observed on Histogram1. + break; + } + } + if (!RdInput) + return; // no such input found + // We need to check that RdInput is not used again after this sequence, + // otherwise we could be making the code worse. The use of RdInput is + // counted as being at its user's bale head. 
+    auto Def = dyn_cast<Instruction>(RdInput);
+    for (auto ui = RdInput->use_begin(), ue = RdInput->use_end();
+         ui != ue; ++ui) {
+      auto User = cast<Instruction>(ui->getUser());
+      auto UserHead = Baling->getBaleHead(User);
+      switch (getReachability(UserHead, Def)) {
+      case AFTER:
+      case REACHES:
+        return;
+      }
+    }
+  }
+  // Scan forwards through the wrregion sequence, keeping track of which
+  // elements of the vector keep their original values. Then for each one see
+  // if it has a rdregion whose input is the same as the first wrregion's "old
+  // value" input. If so, and the region has not been overwritten by wrregions
+  // so far, remember it as one that we want to change. We calculate which
+  // regions have been overwritten by starting with a vector of all 0s and then
+  // simulating the writes by writing -1s. If the region we want at the end is
+  // still all 0s, then it has not been overwritten.
+  SmallVector<std::pair<Instruction *, Value *>, 4> RdsToModify;
+  Constant *C = Constant::getNullValue(EndWr->getType());
+  for (auto ThisWr = StartWr;;) {
+    // For elements overwritten by ThisWr, change the corresponding elements
+    // in C to all ones.
+    Region R(ThisWr, BaleInfo());
+    C = R.evaluateConstantWrRegion(C,
+        Constant::getAllOnesValue(ThisWr->getOperand(1)->getType()));
+    // Move on to next wrregion.
+    if (ThisWr == EndWr)
+      break;
+    ThisWr = cast<CallInst>(ThisWr->use_begin()->getUser());
+    // Scan the rdregions in ThisWr's bale.
+    Bale B;
+    Baling->buildBale(ThisWr, &B);
+    for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) {
+      if (bi->Info.Type != BaleInfo::RDREGION)
+        continue;
+      if (bi->Inst->getOperand(0) != RdInput)
+        continue;
+      Instruction *Rd = bi->Inst;
+      // See if the rdregion only reads a region that has not been overwritten
+      // by any wrregion up to now.
+      Region RdR(Rd, BaleInfo());
+      if (RdR.Indirect)
+        return; // Fail if rdregion is indirect
+      Constant *SubC = RdR.evaluateConstantRdRegion(C, /*AllowScalar=*/false);
+      if (!SubC->isNullValue())
+        return; // Fail if reads overwritten region
+      // Remember this rdregion for modifying.
+      RdsToModify.push_back(
+          std::pair<Instruction *, Value *>(Rd, ThisWr->getOperand(0)));
+    }
+  }
+  // No failures, so do the modification.
+  if (RdsToModify.empty())
+    return;
+  Modified = true;
+  SmallVector<Instruction *, 4> UselessWrRegions;
+  for (auto ri = RdsToModify.begin(), re = RdsToModify.end(); ri != re; ++ri) {
+    // Change the input to the rdregion.
+    auto Rd = ri->first;
+    auto RdInput = ri->second;
+    Rd->setOperand(0, RdInput);
+    // Check for the case that we have a rdregion-wrregion bale that is now
+    // useless because it reads and writes the same region.
+    auto Wr = Baling->getBaleParent(Rd);
+    if (GenXIntrinsic::isWrRegion(Wr)
+        && Region(Wr, BaleInfo()) == Region(Rd, BaleInfo())) {
+      UselessWrRegions.push_back(Wr);
+      continue;
+    }
+    // We already know that the rdregion's position in generated code (as
+    // reflected by the order of heads of bales) is after the instruction
+    // generating its new input. However, ignoring baling, it might actually be
+    // _before_ that instruction in the IR, which causes the verifier pass to
+    // complain. We work around that by moving the rdregion (and any other
+    // instruction in the bale between it and the head) to just before the head
+    // of its bale.
+ SmallVector BaleTrace; + BaleTrace.push_back(Rd); + for (;;) { + auto Parent = Baling->getBaleParent(BaleTrace.back()); + if (!Parent) + break; + BaleTrace.push_back(Parent); + } + for (unsigned i = 0, e = BaleTrace.size() - 1; i != e; ++i) { + auto InstToMove = BaleTrace[i]; + InstToMove->moveBefore(BaleTrace.back()); + } + } + // For the undef input case, also modify that. + if (isa(StartWrInput)) + StartWr->setOperand(0, RdInput); + // Now remove the useless wrregions found above. + for (auto i = UselessWrRegions.begin(), e = UselessWrRegions.end(); + i != e; ++i) { + auto Wr = *i; + auto Rd = cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + Wr->replaceAllUsesWith( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + Liveness->removeValue(Wr); + Liveness->removeValue(Rd); + ToErase.push_back(Wr); + ToErase.push_back(Rd); + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp new file mode 100644 index 000000000000..eb7dab05963a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp @@ -0,0 +1,1446 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Utility functions for the GenX backend. 
+// +//===----------------------------------------------------------------------===// +#include "GenXUtil.h" +#include "FunctionGroup.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { +struct InstScanner { + Instruction *Original; + Instruction *Current; + InstScanner(Instruction *Inst) : Original(Inst), Current(Inst) {} +}; + +} // namespace + +/*********************************************************************** + * createConvert : create a genx_convert intrinsic call + * + * Enter: In = value to convert + * Name = name to give convert instruction + * InsertBefore = instruction to insert before else 0 + * M = Module (can be 0 as long as InsertBefore is not 0) + */ +CallInst *genx::createConvert(Value *In, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_convert, + In->getType()); + return CallInst::Create(Decl, In, Name, InsertBefore); +} + +/*********************************************************************** + * createConvertAddr : create a genx_convert_addr intrinsic call + * + * Enter: In = value to convert + * Offset = constant offset + * Name = name to give convert instruction + * InsertBefore = instruction to insert before else 0 + * M = Module (can be 0 as long as InsertBefore is not 0) + */ +CallInst *genx::createConvertAddr(Value *In, int Offset, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + auto OffsetVal = ConstantInt::get(In->getType()->getScalarType(), Offset); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_convert_addr, + In->getType()); + Value *Args[] = { In, OffsetVal }; + return CallInst::Create(Decl, Args, Name, InsertBefore); +} + +/*********************************************************************** + * createAddAddr : create a genx_add_addr intrinsic call + * + * InsertBefore can be 0 so the new instruction is not inserted anywhere, + * but in that case M must be non-0 and set to the Module. + */ +CallInst *genx::createAddAddr(Value *Lhs, Value *Rhs, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + Value *Args[] = {Lhs, Rhs}; + Type *Tys[] = {Rhs->getType(), Lhs->getType()}; + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_add_addr, Tys); + return CallInst::Create(Decl, Args, Name, InsertBefore); +} + +/*********************************************************************** + * createUnifiedRet : create a dummy instruction that produces dummy + * unified return value. 
+ * + * %Name.unifiedret = call Ty @llvm.ssa_copy(Ty undef) + */ +CallInst *genx::createUnifiedRet(Type *Ty, const Twine &Name, Module *M) { + assert(Ty && M && "wrong arguments"); + auto G = Intrinsic::getDeclaration(M, Intrinsic::ssa_copy, Ty); + return CallInst::Create(G, UndefValue::get(Ty), Name + ".unifiedret", + static_cast(nullptr)); +} + +/*********************************************************************** + * getPredicateConstantAsInt : get an i1 or vXi1 constant's value as a single integer + */ +unsigned genx::getPredicateConstantAsInt(Constant *C) +{ + if (auto CI = dyn_cast(C)) + return CI->getZExtValue(); // scalar + unsigned Bits = 0; + unsigned NumElements = cast(C->getType())->getNumElements(); + for (unsigned i = 0; i != NumElements; ++i) { + auto El = C->getAggregateElement(i); + if (!isa(El)) + Bits |= (cast(El)->getZExtValue() & 1) << i; + } + return Bits; +} + +/*********************************************************************** + * getConstantSubvector : get a contiguous region from a vector constant + */ +Constant *genx::getConstantSubvector(Constant *V, + unsigned StartIdx, unsigned Size) +{ + Type *ElTy = cast(V->getType())->getElementType(); + Type *RegionTy = VectorType::get(ElTy, Size); + if (isa(V)) + V = UndefValue::get(RegionTy); + else if (isa(V)) + V = ConstantAggregateZero::get(RegionTy); + else { + SmallVector Val; + for (unsigned i = 0; i != Size; ++i) + Val.push_back(V->getAggregateElement(i + StartIdx)); + V = ConstantVector::get(Val); + } + return V; +} + +/*********************************************************************** + * concatConstants : concatenate two possibly vector constants, giving a + * vector constant + */ +Constant *genx::concatConstants(Constant *C1, Constant *C2) +{ + assert(C1->getType()->getScalarType() == C2->getType()->getScalarType()); + Constant *CC[] = { C1, C2 }; + SmallVector Vec; + bool AllUndef = true; + for (unsigned Idx = 0; Idx != 2; ++Idx) { + Constant *C = CC[Idx]; + if (auto VT = dyn_cast(C->getType())) { + for (unsigned i = 0, e = VT->getNumElements(); i != e; ++i) { + Constant *El = C->getAggregateElement(i); + Vec.push_back(El); + AllUndef &= isa(El); + } + } else { + Vec.push_back(C); + AllUndef &= isa(C); + } + } + auto Res = ConstantVector::get(Vec); + if (AllUndef) + Res = UndefValue::get(Res->getType()); + return Res; +} + +/*********************************************************************** + * findClosestCommonDominator : find closest common dominator of some instructions + * + * Enter: DT = dominator tree + * Insts = the instructions + * + * Return: The one instruction that dominates all the others, if any. + * Otherwise the terminator of the closest common dominating basic + * block. + */ +Instruction *genx::findClosestCommonDominator(DominatorTree *DT, + ArrayRef Insts) +{ + assert(!Insts.empty()); + SmallVector InstScanners; + // Find the closest common dominating basic block. + Instruction *Inst0 = Insts[0]; + BasicBlock *NCD = Inst0->getParent(); + InstScanners.push_back(InstScanner(Inst0)); + for (unsigned ii = 1, ie = Insts.size(); ii != ie; ++ii) { + Instruction *Inst = Insts[ii]; + if (Inst->getParent() != NCD) { + auto NewNCD = DT->findNearestCommonDominator(NCD, Inst->getParent()); + if (NewNCD != NCD) + InstScanners.clear(); + NCD = NewNCD; + } + if (NCD == Inst->getParent()) + InstScanners.push_back(Inst); + } + // Now we have NCD = the closest common dominating basic block, and + // InstScanners populated with the instructions from Insts that are + // in that block. 
+ if (InstScanners.empty()) { + // No instructions in that block. Return the block's terminator. + return NCD->getTerminator(); + } + if (InstScanners.size() == 1) { + // Only one instruction in that block. Return it. + return InstScanners[0].Original; + } + // Create a set of the original instructions. + std::set OrigInsts; + for (auto i = InstScanners.begin(), e = InstScanners.end(); i != e; ++i) + OrigInsts.insert(i->Original); + // Scan back one instruction at a time for each scanner. If a scanner reaches + // another original instruction, the scanner can be removed, and when we are + // left with one scanner, that must be the earliest of the original + // instructions. If a scanner reaches the start of the basic block, that was + // the earliest of the original instructions. + // + // In the worst case, this algorithm could scan all the instructions in a + // basic block, but it is designed to be better than that in the common case + // that the original instructions are close to each other. + for (;;) { + for (auto i = InstScanners.begin(), e = InstScanners.end(); i != e; ++i) { + if (i->Current == &i->Current->getParent()->front()) + return i->Original; // reached start of basic block + i->Current = i->Current->getPrevNode(); + if (OrigInsts.find(i->Current) != OrigInsts.end()) { + // Scanned back to another instruction in our original set. Remove + // this scanner. + *i = InstScanners.back(); + InstScanners.pop_back(); + if (InstScanners.size() == 1) + return InstScanners[0].Original; // only one scanner left + break; // restart loop so as not to confuse the iterator + } + } + } +} + +/*********************************************************************** + * getTwoAddressOperandNum : get operand number of two address operand + * + * If an intrinsic has a "two address operand", then that operand must be + * in the same register as the result. This function returns the operand number + * of the two address operand if any, or -1 if not. 
+ */ +int genx::getTwoAddressOperandNum(CallInst *CI) +{ + auto IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (IntrinsicID == GenXIntrinsic::not_any_intrinsic) + return -1; // not intrinsic + if (GenXIntrinsic::isWrRegion(IntrinsicID) || + IntrinsicID == GenXIntrinsic::genx_wrpredregion || + IntrinsicID == GenXIntrinsic::genx_wrpredpredregion) + return 0; // wr(pred(pred))region has operand 0 as two address operand + if (CI->getType()->isVoidTy()) + return -1; // no return value + GenXIntrinsicInfo II(IntrinsicID); + unsigned Num = CI->getNumArgOperands(); + if (!Num) + return -1; // no args + --Num; // Num = last arg number, could be two address operand + if (isa(CI->getOperand(Num))) + return -1; // operand is undef, must be RAW_NULLALLOWED + if (II.getArgInfo(Num).getCategory() != GenXIntrinsicInfo::TWOADDR) + return -1; // not two addr operand + if (CI->use_empty() && II.getRetInfo().rawNullAllowed()) + return -1; // unused result will be V0 + return Num; // it is two addr +} + +/*********************************************************************** + * isNot : test whether an instruction is a "not" instruction (an xor with + * constant all ones) + */ +bool genx::isNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue()) + return true; + return false; +} + +/*********************************************************************** + * isPredNot : test whether an instruction is a "not" instruction (an xor + * with constant all ones) with predicate (i1 or vector of i1) type + */ +bool genx::isPredNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue() && C->getType()->getScalarType()->isIntegerTy(1)) + return true; + return false; +} + +/*********************************************************************** + * isIntNot : test whether an instruction is a "not" instruction (an xor + * with constant all ones) with non-predicate type + */ +bool genx::isIntNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue() && !C->getType()->getScalarType()->isIntegerTy(1)) + return true; + return false; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsSlice : see if the shufflevector is a slice on + * operand 0, and if so return the start index, or -1 if it is not a slice + */ +int ShuffleVectorAnalyzer::getAsSlice() +{ + unsigned WholeWidth = SI->getOperand(0)->getType()->getVectorNumElements(); + Constant *Selector = cast(SI->getOperand(2)); + unsigned Width = SI->getType()->getVectorNumElements(); + unsigned StartIdx = cast( + Selector->getAggregateElement((unsigned)0))->getZExtValue(); + if (StartIdx >= WholeWidth) + return -1; // start index beyond operand 0 + unsigned SliceWidth; + for (SliceWidth = 1; SliceWidth != Width; ++SliceWidth) { + auto CI = dyn_cast(Selector->getAggregateElement(SliceWidth)); + if (!CI) + break; + if (CI->getZExtValue() != StartIdx + SliceWidth) + return -1; // not slice + } + return StartIdx; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::isReplicatedSlice : check if the shufflevector + * is a replicated slice on operand 0. 
+ */ +bool ShuffleVectorAnalyzer::isReplicatedSlice() const { + const auto MaskVals = SI->getShuffleMask(); + auto Begin = MaskVals.begin(); + auto End = MaskVals.end(); + + // Check for undefs. + if (std::find(Begin, End, -1) != End) + return false; + + if (MaskVals.size() == 1) + return true; + + // Slice should not touch second operand. + auto MaxIndex = static_cast(MaskVals.back()); + if (MaxIndex >= SI->getOperand(0)->getType()->getVectorNumElements()) + return false; + + // Find first non-one difference. + auto SliceEnd = + std::adjacent_find(Begin, End, + [](int Prev, int Next) { return Next - Prev != 1; }); + // If not found, then it is simple slice. + if (SliceEnd == End) + return true; + + // Compare slice with parts of sequence to prove that it is periodic. + ++SliceEnd; + unsigned SliceSize = std::distance(Begin, SliceEnd); + // Slice should be replicated. + if (MaskVals.size() % SliceSize != 0) + return false; + + for (auto It = SliceEnd; It != End; std::advance(It, SliceSize)) + if (!std::equal(Begin, SliceEnd, It)) + return false; + + return true; +} + +// Based on the value of a shufflevector mask element defines in which of +// 2 operands it points. The operand is returned. +static Value *getOperandByMaskValue(const ShuffleVectorInst &SI, + int MaskValue) { + assert(MaskValue >= 0 && "invalid index"); + int FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + if (MaskValue < FirstOpSize) + return SI.getOperand(0); + else { + int SecondOpSize = SI.getOperand(1)->getType()->getVectorNumElements(); + assert(MaskValue < FirstOpSize + SecondOpSize && "invalid index"); + return SI.getOperand(1); + } +} + +// safe advance +// If adding \p N results in bound violation, \p Last is written to \p It +template void advanceSafe(Iter &It, Iter Last, int N) { + if (N > std::distance(It, Last)) { + It = Last; + return; + } + std::advance(It, N); +} + +// Returns operand and its region of 1 element that is referenced by +// \p MaskVal element of shufflevector mask. +static ShuffleVectorAnalyzer::OperandRegionInfo +matchOneElemRegion(const ShuffleVectorInst &SI, int MaskVal) { + ShuffleVectorAnalyzer::OperandRegionInfo Init; + Init.Op = getOperandByMaskValue(SI, MaskVal); + Init.R = Region(Init.Op); + Init.R.NumElements = Init.R.Width = 1; + if (Init.Op == SI.getOperand(0)) + Init.R.Offset = MaskVal * Init.R.ElementBytes; + else { + auto FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + Init.R.Offset = (MaskVal - FirstOpSize) * Init.R.ElementBytes; + } + return Init; +} + +// Takes shufflevector mask indexes from [\p FirstIt, \p LastIt), +// converts them to the indexes of \p Operand of \p SI instruction +// and writes them to \p OutIt. +// Invalid indexes become negative numbers. +template +void makeSVIIndexesOperandIndexes(const ShuffleVectorInst &SI, + const Value &Operand, ForwardIter FirstIt, + ForwardIter LastIt, OutputIter OutIt) { + int FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + if (&Operand == SI.getOperand(0)) { + std::transform(FirstIt, LastIt, OutIt, [FirstOpSize](int MaskVal) { + if (MaskVal >= FirstOpSize) + return -1; + return MaskVal; + }); + return; + } + assert(&Operand == SI.getOperand(1) && + "wrong argument: a shufflevector operand was expected"); + std::transform(FirstIt, LastIt, OutIt, + [FirstOpSize](int MaskVal) { return MaskVal - FirstOpSize; }); +} + +// Matches "vector" region (with vstride == 0) pattern in +// [\p FirstIt, \p LastIt) indexes. 
+// Uses info in \p FirstElemRegion, adds defined Width, Stride and +// new NumElements to \p FirstElemRegion and returns resulting region. +// +// Arguments: +// [\p FirstIt, \p LastIt) is the range of indexes into some vector. +// Negative index means invalid index. +// \p FirstElemRegion describes one element region with only one index +// *FirstIt. +template +Region matchVectorRegionByIndexes(Region FirstElemRegion, ForwardIter FirstIt, + ForwardIter LastIt) { + assert(FirstIt != LastIt && "the range must contain at least 1 element"); + + if (std::distance(FirstIt, LastIt) == 1) + return FirstElemRegion; + int Stride = *std::next(FirstIt) - *FirstIt; + if (Stride < 0) + return FirstElemRegion; + auto NewRowIt = + std::adjacent_find(FirstIt, LastIt, [Stride](int First, int Second) { + return Second < 0 || Second - First != Stride; + }); + if (NewRowIt != LastIt) { + ++NewRowIt; + } + int Width = std::distance(FirstIt, NewRowIt); + assert(Width > 0 && "should be at least 1 according to algorithm"); + if (Width == 1) + // Stride doesn't play role when the Width is 1. + // Also it prevents from writing to big value in the region. + Stride = 0; + FirstElemRegion.Stride = Stride; + FirstElemRegion.Width = Width; + FirstElemRegion.NumElements = Width; + return FirstElemRegion; +} + +// Matches "matrix" region (vstride may not equal to 0) pattern in +// [\p FirstIt, \p LastIt) index. +// Uses info in \p FirstRowRegion, adds defined VStride and new NumElements to +// \p FirstRowRegion and returns resulting region. +// +// Arguments: +// [\p FirstIt, \p LastIt) is the range of indexes into some vector. +// Negative index means invalid index. +// \p FirstRowRegion describes "vector" region (with vstride == 0), +// which is formed by first 'FirstRowRegion.NumElements' elements +// of the range. +template +Region matchMatrixRegionByIndexes(Region FirstRowRegion, ForwardIter FirstIt, + ForwardIter LastIt) { + assert(FirstRowRegion.NumElements == FirstRowRegion.Width && + FirstRowRegion.VStride == 0 && + "wrong argunent: vector region (with no vstride) was expected"); + +// TODO: rewrite this assert to remove VS build error +// assert(std::distance(FirstIt, LastIt) >= FirstRowRegion.Width && +// "wrong argument: number of indexes must be at least equal to region " +// "width"); + + auto FirstRowEndIt = std::next(FirstIt, FirstRowRegion.Width); + if (FirstRowEndIt == LastIt) + return FirstRowRegion; + int VStride = *FirstRowEndIt - *FirstIt; + if (VStride < 0) + return FirstRowRegion; + + int Width = FirstRowRegion.Width; + int VDistance = VStride; + int NumElements = Width; + for (auto It = FirstRowEndIt; It != LastIt; advanceSafe(It, LastIt, Width), + NumElements += Width, VDistance += VStride) { + if (std::distance(It, LastIt) < Width || + !std::equal(FirstIt, FirstRowEndIt, It, + [VDistance](int Reference, int Current) { + return Current - Reference == VDistance && Current >= 0; + })) + break; + } + if (NumElements == Width) + // VStride doesn't play role when the Width is equal to NumElements. + // Also it prevents from writing to big value in the region. + VStride = 0; + FirstRowRegion.VStride = VStride; + FirstRowRegion.NumElements = NumElements; + return FirstRowRegion; +} + +// Analyzes shufflevector mask starting from \p StartIdx element of it. +// Finds the longest prefix of the cutted shufflevector mask that can be +// represented as a region of one operand of the instruction. +// Returns the operand and its region. 
+// +// For example: +// {0, 1, 3, 4, 25, 16 ...} -> first 4 elements form a region: +// <3;2,1> vstride=3, width=2, stride=1 +ShuffleVectorAnalyzer::OperandRegionInfo +ShuffleVectorAnalyzer::getMaskRegionPrefix(int StartIdx) { + assert(StartIdx >= 0 && + StartIdx < static_cast(SI->getShuffleMask().size()) && + "Start index is out of bound"); + + auto MaskVals = SI->getShuffleMask(); + auto StartIt = std::next(MaskVals.begin(), StartIdx); + OperandRegionInfo Res = matchOneElemRegion(*SI, *StartIt); + + if (StartIdx == MaskVals.size() - 1) + return Res; + + makeSVIIndexesOperandIndexes(*SI, *Res.Op, StartIt, MaskVals.end(), StartIt); + + Res.R = matchVectorRegionByIndexes(std::move(Res.R), StartIt, MaskVals.end()); + Res.R = matchMatrixRegionByIndexes(std::move(Res.R), StartIt, MaskVals.end()); + return Res; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsUnslice : see if the shufflevector is an + * unslice where the "old value" is operand 0 and operand 1 is another + * shufflevector and operand 0 of that is the "new value" + * + * Return: start index, or -1 if it is not an unslice + */ +int ShuffleVectorAnalyzer::getAsUnslice() +{ + auto SI2 = dyn_cast(SI->getOperand(1)); + if (!SI2) + return -1; + Constant *MaskVec = cast(SI->getOperand(2)); + // Find prefix of undef or elements from operand 0. + unsigned OldWidth = SI2->getType()->getVectorNumElements(); + unsigned NewWidth = SI2->getOperand(0)->getType()->getVectorNumElements(); + unsigned Prefix = 0; + for (;; ++Prefix) { + if (Prefix == OldWidth - NewWidth) + break; + Constant *IdxC = MaskVec->getAggregateElement(Prefix); + if (isa(IdxC)) + continue; + unsigned Idx = cast(IdxC)->getZExtValue(); + if (Idx == OldWidth) + break; // found end of prefix + if (Idx != Prefix) + return -1; // not part of prefix + } + // Check that the whole of SI2 operand 0 follows + for (unsigned i = 1; i != NewWidth; ++i) { + Constant *IdxC = MaskVec->getAggregateElement(Prefix + i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i + OldWidth) + return -1; // not got whole of SI2 operand 0 + } + // Check that the remainder is undef or elements from operand 0. + for (unsigned i = Prefix + NewWidth; i != OldWidth; ++i) { + Constant *IdxC = MaskVec->getAggregateElement(i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i) + return -1; + } + // Check that the first Prefix elements of SI2 come from its operand 1. + Constant *MaskVec2 = cast(SI2->getOperand(2)); + for (unsigned i = 0; i != Prefix; ++i) { + Constant *IdxC = MaskVec2->getAggregateElement(Prefix + i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i) + return -1; + } + // Success. + return Prefix; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsSplat : if shufflevector is a splat, get the + * splatted input, with its vector index if the input is a vector + */ +ShuffleVectorAnalyzer::SplatInfo ShuffleVectorAnalyzer::getAsSplat() +{ + Value *InVec1 = SI->getOperand(0); + Value *InVec2 = SI->getOperand(1); + Constant *MaskVec = cast(SI->getOperand(2)); + ConstantInt *IdxVal = dyn_cast_or_null(MaskVec->getSplatValue()); + if (!IdxVal) + return SplatInfo(0, 0); + // The mask is a splat. Work out which element of which input vector + // it refers to. 
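+  // Illustrative sketch (example values): with a 4-element %InVec1 and a
+  // splat mask value of 5, ShuffleIdx 5 lies beyond operand 0, so below it
+  // is rebased to element 1 of operand 1.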
+ unsigned ShuffleIdx = IdxVal->getSExtValue(); + unsigned InVec1NumElements = InVec1->getType()->getVectorNumElements(); + if (ShuffleIdx >= InVec1NumElements) { + ShuffleIdx -= InVec1NumElements; + InVec1 = InVec2; + } + if (auto IE = dyn_cast(InVec1)) { + if (InVec1NumElements == 1 || isa(IE->getOperand(0))) + return SplatInfo(IE->getOperand(1), 0); + // Even though this is a splat, the input vector has more than one + // element. IRBuilder::CreateVectorSplat does this. See if the input + // vector is the result of an insertelement at the right place, and + // if so return that. Otherwise we end up allocating + // an unnecessarily large register. + if (auto ConstIdx = dyn_cast(IE->getOperand(2))) + if (ConstIdx->getSExtValue() == ShuffleIdx) + return SplatInfo(IE->getOperand(1), 0); + } + return SplatInfo(InVec1, ShuffleIdx); +} + +Value *ShuffleVectorAnalyzer::serialize() { + unsigned Cost0 = getSerializeCost(0); + unsigned Cost1 = getSerializeCost(1); + + Value *Op0 = SI->getOperand(0); + Value *Op1 = SI->getOperand(1); + Value *V = Op0; + bool UseOp0AsBase = Cost0 <= Cost1; + if (!UseOp0AsBase) + V = Op1; + + // Expand or shink the initial value if sizes mismatch. + unsigned NElts = SI->getType()->getVectorNumElements(); + unsigned M = V->getType()->getVectorNumElements(); + bool SkipBase = true; + if (M != NElts) { + if (auto C = dyn_cast(V)) { + SmallVector Vals; + for (unsigned i = 0; i < NElts; ++i) { + Type *Ty = C->getType()->getVectorElementType(); + Constant *Elt = + (i < M) ? C->getAggregateElement(i) : UndefValue::get(Ty); + Vals.push_back(Elt); + } + V = ConstantVector::get(Vals); + } else { + // Need to insert individual elements. + V = UndefValue::get(SI->getType()); + SkipBase = false; + } + } + + IRBuilder<> Builder(SI); + for (unsigned i = 0; i < NElts; ++i) { + // Undef index returns -1. + int idx = SI->getMaskValue(i); + if (idx < 0) + continue; + if (SkipBase) { + if (UseOp0AsBase && idx == i) + continue; + if (!UseOp0AsBase && idx == i + M) + continue; + } + + Value *Vi = nullptr; + if (idx < (int)M) + Vi = Builder.CreateExtractElement(Op0, idx, ""); + else + Vi = Builder.CreateExtractElement(Op1, idx - M, ""); + if (!isa(Vi)) + V = Builder.CreateInsertElement(V, Vi, i, ""); + } + + return V; +} + +unsigned ShuffleVectorAnalyzer::getSerializeCost(unsigned i) { + unsigned Cost = 0; + Value *Op = SI->getOperand(i); + if (!isa(Op) && Op->getType() != SI->getType()) + Cost += Op->getType()->getVectorNumElements(); + + unsigned NElts = SI->getType()->getVectorNumElements(); + for (unsigned j = 0; j < NElts; ++j) { + // Undef index returns -1. + int idx = SI->getMaskValue(j); + if (idx < 0) + continue; + // Count the number of elements out of place. + unsigned M = Op->getType()->getVectorNumElements(); + if ((i == 0 && idx != j) || (i == 1 && idx != j + M)) + Cost++; + } + + return Cost; +} + +/*********************************************************************** + * adjustPhiNodesForBlockRemoval : adjust phi nodes when removing a block + * + * Enter: Succ = the successor block to adjust phi nodes in + * BB = the block being removed + * + * This modifies each phi node in Succ as follows: the incoming for BB is + * replaced by an incoming for each of BB's predecessors. + */ +void genx::adjustPhiNodesForBlockRemoval(BasicBlock *Succ, BasicBlock *BB) +{ + for (auto i = Succ->begin(), e = Succ->end(); i != e; ++i) { + auto Phi = dyn_cast(&*i); + if (!Phi) + break; + // For this phi node, get the incoming for BB. 
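+    // Sketch of the rewrite done below (hypothetical names): if BB has
+    // predecessors %P1 and %P2, an incoming [ %v, %BB ] becomes
+    // [ %v, %P1 ], and a new incoming [ %v, %P2 ] is added.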
+ int Idx = Phi->getBasicBlockIndex(BB); + assert(Idx >= 0); + Value *Incoming = Phi->getIncomingValue(Idx); + // Iterate through BB's predecessors. For the first one, replace the + // incoming block with the predecessor. For subsequent ones, we need + // to add new phi incomings. + auto pi = pred_begin(BB), pe = pred_end(BB); + assert(pi != pe); + Phi->setIncomingBlock(Idx, *pi); + for (++pi; pi != pe; ++pi) + Phi->addIncoming(Incoming, *pi); + } +} + +/*********************************************************************** + * sinkAdd : sink add(s) in address calculation + * + * Enter: IdxVal = the original index value + * + * Return: the new calculation for the index value + * + * This detects the case when a variable index in a region or element access + * is one or more constant add/subs then some mul/shl/truncs. It sinks + * the add/subs into a single add after the mul/shl/truncs, so the add + * stands a chance of being baled in as a constant offset in the region. + * + * If add sinking is successfully applied, it may leave now unused + * instructions behind, which need tidying by a later dead code removal + * pass. + */ +Value *genx::sinkAdd(Value *V) { + Instruction *IdxVal = dyn_cast(V); + if (!IdxVal) + return V; + // Collect the scale/trunc/add/sub/or instructions. + int Offset = 0; + SmallVector ScaleInsts; + Instruction *Inst = IdxVal; + int Scale = 1; + bool NeedChange = false; + for (;;) { + if (isa(Inst)) + ScaleInsts.push_back(Inst); + else { + if (!isa(Inst)) + break; + if (ConstantInt *CI = dyn_cast(Inst->getOperand(1))) { + if (Inst->getOpcode() == Instruction::Mul) { + Scale *= CI->getSExtValue(); + ScaleInsts.push_back(Inst); + } else if (Inst->getOpcode() == Instruction::Shl) { + Scale <<= CI->getSExtValue(); + ScaleInsts.push_back(Inst); + } else if (Inst->getOpcode() == Instruction::Add) { + Offset += CI->getSExtValue() * Scale; + if (V != Inst) + NeedChange = true; + } else if (Inst->getOpcode() == Instruction::Sub) { + Offset -= CI->getSExtValue() * Scale; + if (IdxVal != Inst) + NeedChange = true; + } else if(Inst->getOpcode() == Instruction::Or) { + if (!haveNoCommonBitsSet(Inst->getOperand(0), + Inst->getOperand(1), + Inst->getModule()->getDataLayout())) + break; + Offset += CI->getSExtValue() * Scale; + if (V != Inst) + NeedChange = true; + } else + break; + } else + break; + } + Inst = dyn_cast(Inst->getOperand(0)); + if (!Inst) + return V; + } + if (!NeedChange) + return V; + // Clone the scale and trunc instructions, starting with the value that + // was input to the add(s). + for (SmallVectorImpl::reverse_iterator i = ScaleInsts.rbegin(), + e = ScaleInsts.rend(); + i != e; ++i) { + Instruction *Clone = (*i)->clone(); + Clone->insertBefore(IdxVal); + Clone->setName((*i)->getName()); + Clone->setOperand(0, Inst); + Inst = Clone; + } + // Create a new add instruction. 
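+  // Illustrative sketch of the overall rewrite (hypothetical values): for an
+  // index computed as
+  //   %t = add i32 %x, 3
+  //   %idx = shl i32 %t, 2
+  // the scale chain is cloned as %idx.new = shl i32 %x, 2, and the
+  // "addr_add" created here adds the constant offset 3 << 2 = 12.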
+ Inst = BinaryOperator::Create( + Instruction::Add, Inst, + ConstantInt::get(Inst->getType(), (int64_t)Offset, true /*isSigned*/), + Twine("addr_add"), IdxVal); + Inst->setDebugLoc(IdxVal->getDebugLoc()); + return Inst; +} + +/*********************************************************************** +* reorderBlocks : reorder blocks to increase fallthrough, and specifically +* to satisfy the requirements of SIMD control flow +*/ +#define SUCCSZANY (true) +#define SUCCHASINST (succ->size() > 1) +#define SUCCNOINST (succ->size() <= 1) +#define SUCCANYLOOP (true) + +#define PUSHSUCC(BLK, C1, C2) \ + for(succ_iterator succIter = succ_begin(BLK), succEnd = succ_end(BLK); \ + succIter!=succEnd; ++succIter) { \ + llvm::BasicBlock *succ = *succIter; \ + if (!visitSet.count(succ) && C1 && C2) { \ + visitVec.push_back(succ); \ + visitSet.insert(succ); \ + break; \ + } \ + } + +static bool HasSimdGotoJoinInBlock(BasicBlock *BB) +{ + for (BasicBlock::iterator BBI = BB->begin(), + BBE = BB->end(); + BBI != BBE; ++BBI) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(&*BBI); + if (IID == GenXIntrinsic::genx_simdcf_goto || + IID == GenXIntrinsic::genx_simdcf_join) + return true; + } + return false; +} + +void genx::LayoutBlocks(Function &func, LoopInfo &LI) +{ + std::vector visitVec; + std::set visitSet; + // Insertion Position per loop header + std::map InsPos; + + llvm::BasicBlock* entry = &(func.getEntryBlock()); + visitVec.push_back(entry); + visitSet.insert(entry); + InsPos[entry] = entry; + + while (!visitVec.empty()) { + llvm::BasicBlock* blk = visitVec.back(); + llvm::Loop *curLoop = LI.getLoopFor(blk); + if (curLoop) { + auto hd = curLoop->getHeader(); + if (blk == hd && InsPos.find(hd) == InsPos.end()) { + InsPos[blk] = blk; + } + } + // push: time for DFS visit + PUSHSUCC(blk, SUCCANYLOOP, SUCCNOINST); + if (blk != visitVec.back()) + continue; + // push: time for DFS visit + PUSHSUCC(blk, SUCCANYLOOP, SUCCHASINST); + // pop: time to move the block to the right location + if (blk == visitVec.back()) { + visitVec.pop_back(); + if (curLoop) { + auto hd = curLoop->getHeader(); + if (blk != hd) { + // move the block to the beginning of the loop + auto insp = InsPos[hd]; + assert(insp); + if (blk != insp) { + blk->moveBefore(insp); + InsPos[hd] = blk; + } + } + else { + // move the entire loop to the beginning of + // the parent loop + auto LoopStart = InsPos[hd]; + assert(LoopStart); + auto PaLoop = curLoop->getParentLoop(); + auto PaHd = PaLoop ? 
PaLoop->getHeader() : entry; + auto insp = InsPos[PaHd]; + if (LoopStart == hd) { + // single block loop + hd->moveBefore(insp); + } + else { + // loop-header is not moved yet, so should be at the end + // use splice + llvm::Function::BasicBlockListType& BBList = func.getBasicBlockList(); + BBList.splice(insp->getIterator(), BBList, LoopStart->getIterator(), + hd->getIterator()); + hd->moveBefore(LoopStart); + } + InsPos[PaHd] = hd; + } + } + else { + auto insp = InsPos[entry]; + if (blk != insp) { + blk->moveBefore(insp); + InsPos[entry] = blk; + } + } + } + } + + // fix the loop-exit pattern, put break-blocks into the loop + for (llvm::Function::iterator blkIter = func.begin(), blkEnd = func.end(); + blkIter != blkEnd; ++blkIter) { + llvm::BasicBlock *blk = &(*blkIter); + llvm::Loop *curLoop = LI.getLoopFor(blk); + bool allPredLoopExit = true; + unsigned numPreds = 0; + llvm::SmallPtrSet predSet; + for (pred_iterator predIter = pred_begin(blk), predEnd = pred_end(blk); + predIter != predEnd; ++predIter) { + llvm::BasicBlock *pred = *predIter; + numPreds++; + llvm::Loop *predLoop = LI.getLoopFor(pred); + if (curLoop == predLoop) { + llvm::BasicBlock *predPred = pred->getSinglePredecessor(); + if (predPred) { + llvm::Loop *predPredLoop = LI.getLoopFor(predPred); + if (predPredLoop != curLoop && + (!curLoop || curLoop->contains(predPredLoop))) { + if (!HasSimdGotoJoinInBlock(pred)) { + predSet.insert(pred); + } else { + allPredLoopExit = false; + break; + } + } + } + } else if (!curLoop || curLoop->contains(predLoop)) + continue; + else { + allPredLoopExit = false; + break; + } + } + if (allPredLoopExit && numPreds > 1) { + for (SmallPtrSet::iterator predIter = predSet.begin(), + predEnd = predSet.end(); + predIter != predEnd; ++predIter) { + llvm::BasicBlock *pred = *predIter; + llvm::BasicBlock *predPred = pred->getSinglePredecessor(); + assert(predPred); + pred->moveAfter(predPred); + } + } + } +} + +void genx::LayoutBlocks(Function &func) +{ + std::vector visitVec; + std::set visitSet; + // Reorder basic block to allow more fall-through + llvm::BasicBlock* entry = &(func.getEntryBlock()); + visitVec.push_back(entry); + visitSet.insert(entry); + + while (!visitVec.empty()) { + llvm::BasicBlock* blk = visitVec.back(); + // push in the empty successor + PUSHSUCC(blk, SUCCANYLOOP, SUCCNOINST); + if (blk != visitVec.back()) + continue; + // push in the other successor + PUSHSUCC(blk, SUCCANYLOOP, SUCCHASINST); + // pop + if (blk == visitVec.back()) { + visitVec.pop_back(); + if (blk != entry) { + blk->moveBefore(entry); + entry = blk; + } + } + } +} + +// normalize g_load with bitcasts. +// +// When a single g_load is being bitcast'ed to different types, clone g_loads. +bool genx::normalizeGloads(Instruction *Inst) { + assert(isa(Inst)); + auto LI = cast(Inst); + if (getUnderlyingGlobalVariable(LI->getPointerOperand()) == nullptr) + return false; + + // collect all uses connected by bitcasts. + std::set Visited; + // Uses of this loads groupped by the use type. + llvm::MapVector> Uses; + // The working list. + std::vector Insts; + + for (auto UI : LI->users()) + if (auto BI = dyn_cast(UI)) + Insts.push_back(BI); + + while (!Insts.empty()) { + BitCastInst *BCI = Insts.back(); + Insts.pop_back(); + if (Visited.count(BCI)) + continue; + + Uses[BCI->getType()].push_back(BCI); + for (auto UI : BCI->users()) + if (auto BI = dyn_cast(UI)) + Insts.push_back(BI); + } + + // There are more than two uses; clone loads that can fold bitcasts. 
+ if (Uses.size() <= 1) + return false; + + // %0 = load gv + // %1 = bitcast %0 to t1 + // %2 - bitcast %1 to t2 + // + // ==> + // %0 = load gv + // %0.1 = load gv + // %1 = bitcast %0 to t1 + // %2 - bitcast %0.1 to t2 + Instruction *LInst = LI; + for (auto I = Uses.begin(); I != Uses.end(); ++I) { + Type *Ty = I->first; + if (LInst == nullptr) { + LInst = LI->clone(); + LInst->insertAfter(LI); + } + Instruction *NewCI = new BitCastInst(LInst, Ty, ".clone", LInst); + NewCI->moveAfter(LInst); + auto &BInsts = I->second; + for (auto BI : BInsts) + BI->replaceAllUsesWith(NewCI); + LInst = nullptr; + } + return true; +} + +// fold bitcast instruction into Store by change pointer type. +Instruction *genx::foldBitCastInst(Instruction *Inst) { + assert(isa(Inst) || isa(Inst)); + auto LI = dyn_cast(Inst); + auto SI = dyn_cast(Inst); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + GlobalVariable *GV = getUnderlyingGlobalVariable(Ptr); + if (!GV) + return nullptr; + + if (SI) { + Value *Val = SI->getValueOperand(); + if (auto CI = dyn_cast(Val)) { + auto SrcTy = CI->getSrcTy(); + auto NewPtrTy = PointerType::get(SrcTy, SI->getPointerAddressSpace()); + auto NewPtr = ConstantExpr::getBitCast(GV, NewPtrTy); + StoreInst *NewSI = new StoreInst(CI->getOperand(0), NewPtr, + /*volatile*/ SI->isVolatile(), Inst); + NewSI->takeName(SI); + NewSI->setDebugLoc(Inst->getDebugLoc()); + Inst->eraseFromParent(); + return NewSI; + } + } else if (LI && LI->hasOneUse()) { + if (auto CI = dyn_cast(LI->user_back())) { + auto NewPtrTy = PointerType::get(CI->getType(), LI->getPointerAddressSpace()); + auto NewPtr = ConstantExpr::getBitCast(GV, NewPtrTy); + auto NewLI = new LoadInst(NewPtr, "", + /*volatile*/ LI->isVolatile(), Inst); + NewLI->takeName(LI); + NewLI->setDebugLoc(LI->getDebugLoc()); + CI->replaceAllUsesWith(NewLI); + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + LI->eraseFromParent(); + return NewLI; + } + } + + return nullptr; +} + +const GlobalVariable *genx::getUnderlyingGlobalVariable(const Value *V) { + while (auto CE = dyn_cast_or_null(V)) { + if (CE->getOpcode() == CastInst::BitCast) + V = CE->getOperand(0); + else + break; + } + return dyn_cast_or_null(V); +} + +GlobalVariable *genx::getUnderlyingGlobalVariable(Value *V) { + return const_cast( + getUnderlyingGlobalVariable(const_cast(V))); +} + +bool genx::isGlobalStore(StoreInst *ST) { + assert(ST); + return getUnderlyingGlobalVariable(ST->getPointerOperand()) != nullptr; +} + +bool genx::isGlobalLoad(LoadInst *LI) { + assert(LI); + return getUnderlyingGlobalVariable(LI->getPointerOperand()) != nullptr; +} + +bool genx::isLegalValueForGlobalStore(Value *V, Value *StorePtr) { + // Value should be wrregion. + auto *Wrr = dyn_cast(V); + if (!Wrr || !GenXIntrinsic::isWrRegion(Wrr)) + return false; + + // With old value obtained from load instruction with StorePtr. + Value *OldVal = + Wrr->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto *LI = dyn_cast(OldVal); + return LI && (getUnderlyingGlobalVariable(LI->getPointerOperand()) == + getUnderlyingGlobalVariable(StorePtr)); +} + +bool genx::isGlobalStoreLegal(StoreInst *ST) { + assert(isGlobalStore(ST)); + return isLegalValueForGlobalStore(ST->getValueOperand(), + ST->getPointerOperand()); +} + +// The following bale will produce identity moves. 
+// %a0 = load m +// %b0 = load m +// bale { +// %a1 = rrd %a0, R +// %b1 = wrr %b0, %a1, R +// store %b1, m +// } +// +bool genx::isIdentityBale(const Bale &B) { + if (!B.endsWithGStore()) + return false; + + StoreInst *ST = cast(B.getHead()->Inst); + if (B.size() == 1) { + // The value to be stored should be a load from the same global. + auto LI = dyn_cast(ST->getOperand(0)); + return LI && getUnderlyingGlobalVariable(LI->getOperand(0)) == + getUnderlyingGlobalVariable(ST->getOperand(1)); + } + if (B.size() != 3) + return false; + + CallInst *B1 = dyn_cast(ST->getValueOperand()); + GlobalVariable *GV = getUnderlyingGlobalVariable(ST->getPointerOperand()); + if (!GenXIntrinsic::isWrRegion(B1) || !GV) + return false; + assert(B1); + auto B0 = dyn_cast(B1->getArgOperand(0)); + if (!B0 || GV != getUnderlyingGlobalVariable(B0->getPointerOperand())) + return false; + + CallInst *A1 = dyn_cast(B1->getArgOperand(1)); + if (!GenXIntrinsic::isRdRegion(A1)) + return false; + assert(A1); + LoadInst *A0 = dyn_cast(A1->getArgOperand(0)); + if (!A0 || GV != getUnderlyingGlobalVariable(A0->getPointerOperand())) + return false; + + Region R1(A1, BaleInfo()); + Region R2(B1, BaleInfo()); + return R1 == R2; +} + +// Check that region can be represented as raw operand. +bool genx::isValueRegionOKForRaw(Value *V, bool IsWrite, + const GenXSubtarget *ST) { + assert(V); + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (IsWrite) + return false; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (!IsWrite) + return false; + break; + default: + return false; + } + Region R(cast(V), BaleInfo()); + return isRegionOKForRaw(R, ST); +} + +bool genx::isRegionOKForRaw(const genx::Region &R, const GenXSubtarget *ST) { + unsigned GRFWidth = ST ? 
ST->getGRFWidth() : 32; + if (R.Indirect) + return false; + else if (R.Offset & (GRFWidth - 1)) // GRF boundary check + return false; + if (R.Width != R.NumElements) + return false; + if (R.Stride != 1) + return false; + return true; +} + +bool genx::skipOptWithLargeBlock(FunctionGroup &FG) { + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + auto F = *fgi; + if (skipOptWithLargeBlock(*F)) + return true; + } + return false; +} + +std::string genx::getInlineAsmCodes(const InlineAsm::ConstraintInfo &Info) { + return Info.Codes.front(); +} + +bool genx::isInlineAsmMatchingInputConstraint( + const InlineAsm::ConstraintInfo &Info) { + return isdigit(Info.Codes.front()[0]); +} + +genx::ConstraintType genx::getInlineAsmConstraintType(StringRef Codes) { + return llvm::StringSwitch(Codes) + .Case("r", ConstraintType::Constraint_r) + .Case("rw", ConstraintType::Constraint_rw) + .Case("i", ConstraintType::Constraint_i) + .Case("n", ConstraintType::Constraint_n) + .Case("F", ConstraintType::Constraint_F) + .Case("cr", ConstraintType::Constraint_cr) + .Case("a", ConstraintType::Constraint_a) + .Default(ConstraintType::Constraint_unknown); +} + +unsigned +genx::getInlineAsmMatchedOperand(const InlineAsm::ConstraintInfo &Info) { + assert(genx::isInlineAsmMatchingInputConstraint(Info) && + "Matching input expected"); + int OperandValue = std::stoi(Info.Codes.front()); + assert(OperandValue >= 0); + return OperandValue; +} + +std::vector genx::getGenXInlineAsmInfo(MDNode *MD) { + std::vector Result; + for (auto &MDOp : MD->operands()) { + auto EntryMD = dyn_cast(MDOp); + assert(EntryMD && EntryMD->getNumOperands() == 3 && + "error setting metadata for inline asm"); + ConstantAsMetadata *Op0 = + dyn_cast(EntryMD->getOperand(0)); + ConstantAsMetadata *Op1 = + dyn_cast(EntryMD->getOperand(1)); + ConstantAsMetadata *Op2 = + dyn_cast(EntryMD->getOperand(2)); + assert(Op0 && Op1 && Op2 && "error setting metadata for inline asm"); + auto CTy = static_cast( + cast(Op0->getValue())->getZExtValue()); + Result.emplace_back(CTy, cast(Op1->getValue())->getSExtValue(), + cast(Op2->getValue())->getZExtValue()); + } + return Result; +} + +std::vector genx::getGenXInlineAsmInfo(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + MDNode *MD = CI->getMetadata(genx::MD_genx_inline_asm_info); + // empty constraint info + if (!MD) { + auto *IA = cast(CI->getCalledValue()); + assert(IA->getConstraintString().empty() && + "No info only for empty constraint string"); + (void)IA; + return std::vector(); + } + return genx::getGenXInlineAsmInfo(MD); +} + +bool genx::hasConstraintOfType( + const std::vector &ConstraintsInfo, + genx::ConstraintType CTy) { + return llvm::any_of(ConstraintsInfo, [&](const GenXInlineAsmInfo &Info) { + return Info.getConstraintType() == CTy; + }); +} + +unsigned genx::getInlineAsmNumOutputs(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + unsigned NumOutputs; + if (CI->getType()->isVoidTy()) + NumOutputs = 0; + else if (auto ST = dyn_cast(CI->getType())) + NumOutputs = ST->getNumElements(); + else + NumOutputs = 1; + return NumOutputs; +} + +/* for <1 x Ty> returns Ty + * for Ty returns <1 x Ty> + * other cases are unsupported + */ +Type *genx::getCorrespondingVectorOrScalar(Type *Ty) { + if (Ty->isVectorTy()) { + assert(Ty->getVectorNumElements() == 1 && + "wrong argument: scalar or degenerate vector is expected"); + return Ty->getScalarType(); + } + return VectorType::get(Ty, 1); +} + +// info is at main template function +CastInst 
*genx::scalarizeOrVectorizeIfNeeded(Instruction *Inst, Type *RefType) { + return scalarizeOrVectorizeIfNeeded(Inst, &RefType, std::next(&RefType)); +} + +// info is at main template function +CastInst *genx::scalarizeOrVectorizeIfNeeded(Instruction *Inst, + Instruction *InstToReplace) { + return scalarizeOrVectorizeIfNeeded(Inst, &InstToReplace, std::next(&InstToReplace)); +} + +Value *genx::getFunctionPointer(Value *V) { + Instruction *I = nullptr; + while (I = dyn_cast(V)) { + if (isa(I)) + V = I->getOperand(1); + else if (isa(I) || isa(I)) + V = I->getOperand(0); + else + break; + } + ConstantExpr *CE = nullptr; + while ((CE = dyn_cast(V)) && + (CE->getOpcode() == Instruction::ExtractElement || + CE->isCast())) + V = CE->getOperand(0); + if (isa(V) && V->getType()->isPointerTy() && + V->getType()->getPointerElementType()->isFunctionTy()) { + return V; + } + return nullptr; +} + +bool genx::isFuncPointerVec(Value *V, SetVector *Funcs) { + bool Res = true; + if (V->getType()->isVectorTy() && isa(V) && + cast(V)->getOpcode() == Instruction::BitCast) { + Res = getFunctionPointer(cast(V)->getOperand(0)); + } else if (ConstantVector *Vec = dyn_cast(V)) { + for (auto it = Vec->op_begin(), ie = Vec->op_end(); it != ie; it++) { + auto *F = getFunctionPointer(*it); + if (F && Funcs) { + Funcs->insert(cast(F)); + } else if (!F) { + Res = false; + break; + } + } + } else + Res = false; + return Res; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h new file mode 100644 index 000000000000..467a62fec367 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h @@ -0,0 +1,429 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#ifndef GENX_UTIL_H +#define GENX_UTIL_H + +#include "FunctionGroup.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +namespace llvm { +namespace genx { + +// Utility function to get the integral log base 2 of an integer, or -1 if +// the input is not a power of 2. 
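+// For example, exactLog2(64) == 6, while exactLog2(48) == -1 because 48 is
+// not a power of 2.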
+inline int exactLog2(unsigned Val) +{ + unsigned CLZ = countLeadingZeros(Val, ZB_Width); + if (CLZ != 32 && 1U << (31 - CLZ) == Val) + return 31 - CLZ; + return -1; +} + +// Utility function to get the log base 2 of an integer, truncated to an +// integer, or -1 if the number is 0 or negative. +template +inline int log2(T Val) +{ + if (Val <= 0) + return -1; + unsigned CLZ = countLeadingZeros((uint32_t)Val, ZB_Width); + return 31 - CLZ; +} + +// createConvert : create a genx_convert intrinsic call +CallInst *createConvert(Value *In, const Twine &Name, Instruction *InsertBefore, + Module *M = nullptr); + +// createConvertAddr : create a genx_convert_addr intrinsic call +CallInst *createConvertAddr(Value *In, int Offset, const Twine &Name, + Instruction *InsertBefore, Module *M = nullptr); + +// createAddAddr : create a genx_add_addr intrinsic call +CallInst *createAddAddr(Value *Lhs, Value *Rhs, const Twine &Name, + Instruction *InsertBefore, Module *M = nullptr); + +CallInst *createUnifiedRet(Type *Ty, const Twine &Name, Module *M); + +// getPredicateConstantAsInt : get a vXi1 constant's value as a single integer +unsigned getPredicateConstantAsInt(Constant *C); + +// getConstantSubvector : get a contiguous region from a vector constant +Constant *getConstantSubvector(Constant *V, unsigned StartIdx, unsigned Size); + +// concatConstants : concatenate two possibly vector constants, giving a vector +// constant +Constant *concatConstants(Constant *C1, Constant *C2); + +// findClosestCommonDominator : find latest common dominator of some +// instructions +Instruction *findClosestCommonDominator(DominatorTree *DT, + ArrayRef Insts); + +// convertShlShr : convert Shl followed by AShr/LShr by the same amount into +// trunc+sext/zext +Instruction *convertShlShr(Instruction *Inst); + +// splitStructPhis : split all struct phis in a function +bool splitStructPhis(Function *F); + +// breakConstantExprs : break constant expressions in a function. +bool breakConstantExprs(Function *F); + +// normalize g_load with bitcasts. +// +// When a single g_load is being bitcast'ed to different types, clone g_loads. +bool normalizeGloads(Instruction *Inst); + +// fold bitcast instruction to store/load pointer operand if possible. +// Return this new instruction or nullptr. +Instruction *foldBitCastInst(Instruction *Inst); + +// Return the underlying global variable. Return nullptr if it does not exist. +GlobalVariable *getUnderlyingGlobalVariable(Value *V); +const GlobalVariable *getUnderlyingGlobalVariable(const Value *V); + +class Bale; + +bool isGlobalStore(StoreInst *ST); + +bool isGlobalLoad(LoadInst* LI); + +// Check that V is correct as value for global store to StorePtr. +// This implies: +// 1) V is wrregion W; +// 2) Old value of W is result of gload L; +// 3) Pointer operand of L is derived from global variable of StorePtr. +bool isLegalValueForGlobalStore(Value *V, Value *StorePtr); + +// Check that global store ST operands meet condition of +// isLegalValueForGlobalStore. +bool isGlobalStoreLegal(StoreInst *ST); + +bool isIdentityBale(const Bale &B); + +// Check if region of value is OK for baling in to raw operand +// +// Enter: V = value that is possibly rdregion/wrregion +// IsWrite = true if caller wants to see wrregion, false for rdregion +// +// The region must be constant indexed, contiguous, and start on a GRF +// boundary. 
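+// For example (a sketch assuming the default 32-byte GRF): a rdregion that
+// reads 8 contiguous dwords (stride 1) from byte offset 32 is OK as a raw
+// operand, while one starting at byte offset 16, or using stride 2, is not.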
+bool isValueRegionOKForRaw(Value *V, bool IsWrite, const GenXSubtarget *ST); + +// Check if region is OK for baling in to raw operand +// +// The region must be constant indexed, contiguous, and start on a GRF +// boundary. +bool isRegionOKForRaw(const genx::Region &R, const GenXSubtarget *ST); + +// Skip optimizations on functions with large blocks. +inline bool skipOptWithLargeBlock(const Function &F) { + return std::any_of(F.begin(), F.end(), + [](const BasicBlock &BB) { return BB.size() >= 5000; }); +} + +bool skipOptWithLargeBlock(FunctionGroup &FG); + +// getTwoAddressOperandNum : get operand number of two address operand +int getTwoAddressOperandNum(CallInst *CI); + +// isNot : test whether an instruction is a "not" instruction (an xor with +// constant all ones) +bool isNot(Instruction *Inst); + +// isPredNot : test whether an instruction is a "not" instruction (an xor +// with constant all ones) with predicate (i1 or vector of i1) type +bool isPredNot(Instruction *Inst); + +// isIntNot : test whether an instruction is a "not" instruction (an xor +// with constant all ones) with non-predicate type +bool isIntNot(Instruction *Inst); + +// if V is a function pointer return function it points to, +// nullptr otherwise +Value *getFunctionPointer(Value *V); + +// return true if V is a const vector of function pointers, +// fill Funcs with the functions pointed to if provided +bool isFuncPointerVec(Value *V, SetVector *Funcs = nullptr); + +// ShuffleVectorAnalyzer : class to analyze a shufflevector +class ShuffleVectorAnalyzer { + ShuffleVectorInst *SI; + +public: + ShuffleVectorAnalyzer(ShuffleVectorInst *SI) : SI(SI) {} + // getAsSlice : return start index of slice, or -1 if shufflevector is not + // slice + int getAsSlice(); + + // Replicated slice descriptor. + // Replicated slice (e.g. 1 2 3 1 2 3) can be parametrized by + // initial offset (1), slice size (3) and replication count (2). + struct ReplicatedSlice { + unsigned InitialOffset; + unsigned SliceSize; + unsigned ReplicationCount; + ReplicatedSlice(unsigned Offset, unsigned Size, unsigned Count) + : InitialOffset(Offset), SliceSize(Size), ReplicationCount(Count) {} + }; + + // isReplicatedSlice : check whether shufflevector is replicated slice. + // Example of replicated slice: + // shufflevector <3 x T> x, undef, <6 x i32> <1, 2, 1, 2, 1, 2>. + bool isReplicatedSlice() const; + + static bool isReplicatedSlice(ShuffleVectorInst *SI) { + return ShuffleVectorAnalyzer(SI).isReplicatedSlice(); + } + + // When we have replicated slice, its parameters are ealisy deduced + // from first and last elements of mask. This function decomposes + // replicated slice to its parameters. 
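+  // For example (illustrative values): for the 6-element mask
+  // <1, 2, 3, 1, 2, 3>, SliceStart = 1 and SliceEnd = 3, giving
+  // SliceSize = 3 and ReplicationCount = 6 / 3 = 2, i.e.
+  // ReplicatedSlice(1, 3, 2).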
+ ReplicatedSlice getReplicatedSliceDescriptor() const { + assert(isReplicatedSlice() && "Expected replicated slice"); + const unsigned TotalSize = SI->getType()->getVectorNumElements(); + const unsigned SliceStart = SI->getMaskValue(0); + const unsigned SliceEnd = SI->getMaskValue(TotalSize - 1); + const unsigned SliceSize = SliceEnd - SliceStart + 1; + const unsigned ReplicationCount = TotalSize / SliceSize; + return ReplicatedSlice(SliceStart, SliceSize, ReplicationCount); + } + + static ReplicatedSlice getReplicatedSliceDescriptor(ShuffleVectorInst *SI) { + return ShuffleVectorAnalyzer(SI).getReplicatedSliceDescriptor(); + } + + // getAsUnslice : see if the shufflevector is an + // unslice where the "old value" is operand 0 and operand 1 is another + // shufflevector and operand 0 of that is the "new value" Returns start + // index, or -1 if it is not an unslice + int getAsUnslice(); + // getAsSplat : if shufflevector is a splat, get the splatted input, with the + // element's vector index if the input is a vector + struct SplatInfo { + Value *Input; + unsigned Index; + SplatInfo(Value *Input, unsigned Index) : Input(Input), Index(Index) {} + }; + SplatInfo getAsSplat(); + + // Serialize this shuffulevector instruction. + Value *serialize(); + + // Compute the cost in terms of number of insertelement instructions needed. + unsigned getSerializeCost(unsigned i); + + // To describe the region of one of two shufflevector instruction operands. + struct OperandRegionInfo { + Value *Op; + Region R; + }; + OperandRegionInfo getMaskRegionPrefix(int StartIdx); +}; + +// adjustPhiNodesForBlockRemoval : adjust phi nodes when removing a block +void adjustPhiNodesForBlockRemoval(BasicBlock *Succ, BasicBlock *BB); + +/*********************************************************************** + * sinkAdd : sink add(s) in address calculation + * + * Enter: IdxVal = the original index value + * + * Return: the new calculation for the index value + * + * This detects the case when a variable index in a region or element access + * is one or more constant add/subs then some mul/shl/truncs. It sinks + * the add/subs into a single add after the mul/shl/truncs, so the add + * stands a chance of being baled in as a constant offset in the region. + * + * If add sinking is successfully applied, it may leave now unused + * instructions behind, which need tidying by a later dead code removal + * pass. + */ +Value *sinkAdd(Value *V); + +// Check if this is a mask packing operation, i.e. a bitcast from Vxi1 to +// integer i8, i16 or i32. 
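+// For example, a bitcast from <16 x i1> to i16 is a mask packing operation,
+// while a bitcast from <4 x i1> to i4 is not, since only 8, 16 and 32
+// elements are accepted.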
+static inline bool isMaskPacking(const Value *V) {
+  if (auto BC = dyn_cast<BitCastInst>(V)) {
+    auto SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+    if (!SrcTy || !SrcTy->getScalarType()->isIntegerTy(1))
+      return false;
+    unsigned NElts = SrcTy->getVectorNumElements();
+    if (NElts != 8 && NElts != 16 && NElts != 32)
+      return false;
+    return V->getType()->getScalarType()->isIntegerTy(NElts);
+  }
+  return false;
+}
+
+void LayoutBlocks(Function &func, LoopInfo &LI);
+void LayoutBlocks(Function &func);
+
+// Metadata name for inline assembly instruction
+constexpr const char *MD_genx_inline_asm_info = "genx.inlasm.constraints.info";
+
+// Inline assembly available constraints
+enum class ConstraintType : uint32_t {
+  Constraint_r,
+  Constraint_rw,
+  Constraint_i,
+  Constraint_n,
+  Constraint_F,
+  Constraint_a,
+  Constraint_cr,
+  Constraint_unknown
+};
+
+// Represents info about an inline assembly operand
+class GenXInlineAsmInfo {
+  genx::ConstraintType CTy = ConstraintType::Constraint_unknown;
+  int MatchingInput = -1;
+  bool IsOutput = false;
+
+public:
+  GenXInlineAsmInfo(genx::ConstraintType Ty, int MatchingInput, bool IsOutput)
+      : CTy(Ty), MatchingInput(MatchingInput), IsOutput(IsOutput) {}
+  bool hasMatchingInput() const { return MatchingInput != -1; }
+  int getMatchingInput() const { return MatchingInput; }
+  bool isOutput() const { return IsOutput; }
+  genx::ConstraintType getConstraintType() const { return CTy; }
+};
+
+// True if an input constraint has a matching output operand with the same
+// constraint
+bool isInlineAsmMatchingInputConstraint(const InlineAsm::ConstraintInfo &Info);
+
+// Get matched output operand number for input operand
+unsigned getInlineAsmMatchedOperand(const InlineAsm::ConstraintInfo &Info);
+
+// Get joined string representation of constraints
+std::string getInlineAsmCodes(const InlineAsm::ConstraintInfo &Info);
+
+// Get constraint type
+genx::ConstraintType getInlineAsmConstraintType(StringRef Codes);
+
+// Get vector of inline asm info for inline assembly instruction.
+// Return empty vector if no constraint string in inline asm or
+// if called before GenXInlineAsmLowering pass.
+std::vector<GenXInlineAsmInfo> getGenXInlineAsmInfo(CallInst *CI);
+
+// Get vector of inline asm info from MDNode
+std::vector<GenXInlineAsmInfo> getGenXInlineAsmInfo(MDNode *MD);
+
+bool hasConstraintOfType(const std::vector<GenXInlineAsmInfo> &ConstraintsInfo,
+                         genx::ConstraintType CTy);
+
+// Get number of outputs for inline assembly instruction
+unsigned getInlineAsmNumOutputs(CallInst *CI);
+
+Type *getCorrespondingVectorOrScalar(Type *Ty);
+
+/* scalarizeOrVectorizeIfNeeded : scalarize or vectorize \p Inst if required
+ *
+ * The result of some instructions (e.g. rdregion) can be either Ty or
+ * <1 x Ty>. It is sometimes required to replace uses of instructions with
+ * types [\p FirstType, \p LastType) with \p Inst. If the types do not
+ * correspond, this function places a BitCastInst (<1 x Ty> to Ty, or Ty to
+ * <1 x Ty>) after \p Inst and returns a pointer to it. If no cast is
+ * required, nullptr is returned.
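+ *
+ * For example (an illustrative sketch): if \p Inst has type <1 x i32> and a
+ * type in the range is i32, a bitcast from <1 x i32> to i32 is created
+ * right after \p Inst and returned; if all types already match, nullptr is
+ * returned.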
+ */ +template < + typename ConstIter, + typename std::enable_if< + std::is_base_of< + Type, typename std::remove_pointer::value_type>::type>::value, + int>::type = 0> +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, ConstIter FirstType, + ConstIter LastType) { + assert(Inst && "wrong argument"); + assert(std::all_of(FirstType, LastType, + [Inst](Type *Ty) { + return Ty == Inst->getType() || + Ty == getCorrespondingVectorOrScalar( + Inst->getType()); + }) && + "wrong arguments: type of instructions must correspond"); + + if (Inst->getType()->isVectorTy() && + Inst->getType()->getVectorNumElements() > 1) + return nullptr; + bool needBitCast = std::any_of( + FirstType, LastType, [Inst](Type *Ty) { return Ty != Inst->getType(); }); + if (!needBitCast) + return nullptr; + auto *CorrespondingTy = getCorrespondingVectorOrScalar(Inst->getType()); + auto *BC = CastInst::Create(Instruction::BitCast, Inst, CorrespondingTy); + BC->insertAfter(Inst); + return BC; +} +/* scalarVectorizeIfNeeded: scalarize of vectorize \p Inst if it is required + * + * Result of some instructions can be both Ty and <1 x Ty> value e.g. rdregion. + * It is sometimes required to replace uses of instructions of [\p + * FirstInstToReplace, \p LastInstToReplace) with \p Inst. If types don't + * correspond this function places BitCastInst <1 x Ty> to Ty, or Ty to <1 x Ty> + * after \p Inst and returns the pointer to the instruction. If no cast is + * required, nullptr is returned. + */ +template ::value_type>::type>::value, + int>::type = 0> +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, + ConstIter FirstInstToReplace, + ConstIter LastInstToReplace) { + std::vector Types; + std::transform(FirstInstToReplace, LastInstToReplace, + std::back_inserter(Types), + [](Instruction *Inst) { return Inst->getType(); }); + return scalarizeOrVectorizeIfNeeded(Inst, Types.begin(), Types.end()); +} + +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, Type *RefType); + +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, + Instruction *InstToReplace); + +} // namespace genx +} // namespace llvm + +#endif // GENX_UTIL_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp new file mode 100644 index 000000000000..1f95f5c98814 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp @@ -0,0 +1,1177 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// The GenXVectorDecomposer class is called by by the GenXPostLegalization pass +// to perform vector decomposition. See comment in GenXVectorDecomposer.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_POST_LEGALIZATION" + +#include "GenXVectorDecomposer.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +static cl::opt + LimitGenXVectorDecomposer("limit-genx-vector-decomposer", + cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX vector decomposer.")); + +static cl::opt GenXReportVectorDecomposerFailureThreshold( + "genx-report-vector-decomposer-failure-threshold", cl::init(UINT_MAX), + cl::Hidden, + cl::desc("Byte size threshold for reporting failure of GenX vector " + "decomposer.")); + +static cl::opt GenXDefaultSelectPredicateWidth( + "genx-sel-width", cl::init(32), cl::Hidden, + cl::desc("The default width for select predicate splitting.")); + +namespace { + +class DiagnosticVectorDecomposition : public DiagnosticInfo { +private: + const Twine &Description; + Instruction *Inst; + + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticVectorDecomposition(Instruction *I, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), Description(Desc), Inst(I) {} + + void print(DiagnosticPrinter &P) const override { + std::string Str; + raw_string_ostream OS(Str); + + auto DL = Inst->getDebugLoc(); + DL.print(OS); + + OS << ' ' << Description; + OS << '\n'; + OS.flush(); + P << Str; + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticVectorDecomposition::KindID = 0; + +} // end anonymous namespace + +/*********************************************************************** + * VectorDecomposer::run : run the vector decomposer on the start wrregion + * instructions added with addStartWrRegion() + * + * Return: true if code modified + */ +bool VectorDecomposer::run(DominatorTree *ArgDT) +{ + DT = ArgDT; + DL = &DT->getRoots().front()->getModule()->getDataLayout(); + bool Modified = false; + // Process each start wrregion added with addStartWrRegion(). 
+ for (auto swi = StartWrRegions.begin(), swe = StartWrRegions.end(); + swi != swe; ++swi) { + Instruction *Inst = *swi; + Modified |= processStartWrRegion(Inst); + clearOne(); + } + for (auto i = ToDelete.begin(), e = ToDelete.end(); i != e; ++i) + (*i)->deleteValue(); + clear(); + return Modified; +} + +/*********************************************************************** + * VectorDecomposer::processStartWrRegion : process one start wrregion + * + * Enter: Inst = the start wrregion. Note that this might have already + * been erased if it was part of an already processed web, + * so the first thing we have to do is check that. + * + * This processes one start wrregion (a wrregion with constant input). If it has + * not already been seen as part of another web, this processes the web + * containing the start wrregion. + * + * Return: true if code modified + */ +bool VectorDecomposer::processStartWrRegion(Instruction *Inst) +{ + // Determine the web of vectors related by wrregion, phi nodes, bitcast, + // and determine the decomposition that we can do to the web. + if (!determineDecomposition(Inst)) + return false; + static unsigned Count = 0; + if (++Count > LimitGenXVectorDecomposer) + return false; + if (LimitGenXVectorDecomposer != UINT_MAX) + dbgs() << "genx vector decomposer " << Count << "\n"; + decompose(); + clearOne(); + return true; +} + +/*********************************************************************** + * VectorDecomposer::determineDecomposition : determine the web of vectors + * related by wrregion, phi nodes, bitcast, and determine the decomposition + * that we can do to the web + * + * Enter: Inst = the start wrregion. Note that this might have already + * been erased if it was part of an already processed web, + * so the first thing we have to do is check that. + * + * Return: true if decomposition possible; Decomposition and Offsets set up + * as described in the comment near the end of this function + */ +bool VectorDecomposer::determineDecomposition(Instruction *Inst) +{ + if (Seen.find(Inst) != Seen.end()) + return false; // This start wrregion already processed in some other web + // (and may have been erased). + NotDecomposingReportInst = Inst; + Web.clear(); + Decomposition.clear(); + unsigned NumGrfs = alignTo<256>(DL->getTypeSizeInBits(Inst->getType())) / 256; + if (NumGrfs == 1) + return false; // Ignore single GRF vector. + LLVM_DEBUG(dbgs() << "VectorDecomposer::determineDecomposition(" << Inst->getName() << ")\n"); + NotDecomposing = false; + for (unsigned i = 0; i != NumGrfs; ++i) + Decomposition.push_back(i); + addToWeb(Inst); + for (unsigned Idx = 0; Idx != Web.size(); ++Idx) { + Inst = Web[Idx]; + // Look at the def of this value. + if (GenXIntrinsic::isWrRegion(Inst)) { + // wrregion. If the "old value of vector" input is not constant, include + // it in the web. + addToWeb(Inst->getOperand(0), Inst); + } else if (auto Phi = dyn_cast(Inst)) { + // Phi node. Add all incomings to the web. + for (unsigned j = 0, je = Phi->getNumIncomingValues(); j != je; ++j) + addToWeb(Phi->getIncomingValue(j), Phi); + } else if (isa(Inst)) { + // Bitcast. Add the input to the web. But a bitcast with non-instruction + // input confuses this algorithm, so in that case disable it. We're not + // really expecting a bitcast with constant input anyway, although we + // might get one with arg input. 
+ if (isa(Inst->getOperand(0))) + addToWeb(Inst->getOperand(0), Inst); + else + setNotDecomposing(Inst, "use of function argument or constant"); + } else { + // Any other def. This stops decomposition. + if ((isa(Inst) && !GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst)) + || isa(Inst)) + setNotDecomposing(Inst, "return value from call"); + else + setNotDecomposing(Inst, "other non-decomposable definition"); + } + // Look at the uses of this value. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto Phi = dyn_cast(user)) { + // Use in a phi node. Add the result of the phi node and all the other + // incomings to the web. + addToWeb(Phi); + for (unsigned j = 0, je = Phi->getNumIncomingValues(); j != je; ++j) { + auto Incoming = dyn_cast(Phi->getIncomingValue(j)); + if (Incoming && Incoming != Inst) + addToWeb(Incoming, Phi); + } + continue; + } + if ((GenXIntrinsic::isWrRegion(user) && !ui->getOperandNo()) + || isa(user)) { + // Use as the "old value of vector" operand of a wrregion, or in a + // bitcast. Add the result of the wrregion to the web. + addToWeb(user); + continue; + } + if (GenXIntrinsic::isRdRegion(user) && !ui->getOperandNo()) { + // Use as the vector value in rdregion. Adjust decomposition. + adjustDecomposition(user); + continue; + } + // We have some other use that stops us decomposing this web. (We + // continue gathering the web anyway so that all values in it get put + // in the Seen set.) + if (isa(user) || isa(user)) + setNotDecomposing(user, "use as return value"); + else if (isa(user) && !GenXIntrinsic::isAnyNonTrivialIntrinsic(user)) + setNotDecomposing(user, "use as call argument"); + else + setNotDecomposing(user, "other non-decomposable use"); + } + } + if (NotDecomposing) + return false; + // Now we have Decomposition[] set to reflect how we can decompose the GRFs + // of the vector. A range of Decomposition[i] with the same value need to + // be kept together in the same vector. Further, for the start of such a + // range, Decomposition[i] == i. So for example the array might be set to + // { 0, 0, 2, 2, 4, 4, 4, 4, 8, 8 }. + // + // Change Decomposition[] so the indices used are contiguous, changing the + // example above to { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3 }, and create the Offsets[] + // array to translate a value from Decomposition[] into the GRF offset, so + // for this example { 0, 2, 4, 8 }. 
+ Offsets.clear(); + for (unsigned Last = UINT_MAX, i = 0, e = Decomposition.size(); i != e; ++i) { + if (Decomposition[i] != Last) { + Offsets.push_back(Decomposition[i]); + Last = Decomposition[i]; + } + Decomposition[i] = Offsets.size() - 1; + } + LLVM_DEBUG( + dbgs() << "decompose to"; + for (unsigned i = 0; i != Decomposition.size(); ++i) + dbgs() << " " << Decomposition[i]; + dbgs() << ":"; + for (unsigned i = 0; i != Offsets.size(); ++i) + dbgs() << " " << Offsets[i]; + dbgs() << ":"; + for (unsigned i = 0; i != Web.size(); ++i) + dbgs() << " " << Web[i]->getName(); + dbgs() << "\n" + ); + if (Offsets.size() == 1) { + setNotDecomposing(0, "reads and writes in overlapping regions"); + LLVM_DEBUG(dbgs() << "no decomposition\n"); + return false; + } + return true; +} + +/*********************************************************************** + * addToWeb : add value to current vector web, adjusting decompose size + * if it is a wrregion + * + * Enter: V = value to add (if it is an instruction) + * User = instruction V is used in, for reporting failure to + * decompose if V is an Argument + */ +void VectorDecomposer::addToWeb(Value *V, Instruction *User) +{ + if (isa(V)) + return; + auto Inst = dyn_cast(V); + if (!Inst) { + // Cannot decompose with an arg in the web. + setNotDecomposing(User, "use of function argument"); + return; + } + if (Seen.find(Inst) != Seen.end()) + return; // already in the web + // Add to the web. + LLVM_DEBUG(dbgs() << " addToWeb(" << V->getName() << ")\n"); + Seen.insert(Inst); + Web.push_back(Inst); + if (!GenXIntrinsic::isWrRegion(Inst)) + return; + // It is a wrregion. Adjust decomposition. + adjustDecomposition(Inst); +} + +/*********************************************************************** + * adjustDecomposition : adjust web decomposition for region + * + * Enter: Inst = rdregion or wrregion instruction + * + * The vector will be decomposed into contiguous blocks of GRFs. This + * detects if the region accesses multiple GRFs currently slated to be in + * different decomposed vectors, and if so marks them as needing to be + * in the same decomposed vector. + */ +void VectorDecomposer::adjustDecomposition(Instruction *Inst) +{ + if (Decomposition.empty()) + return; // Decomposition[] not set up yet + Region R(Inst, BaleInfo()); + if (R.Indirect) { + setNotDecomposing(Inst, "indirect region"); + return; // cannot decompose if indirect + } + if (NotDecomposing) + return; // decomposition of this vector already disabled + // Compute byte offset of last byte accessed in the region. (This is after + // legalization so we can assume that strides are non-negative.) + unsigned Last = 0; + if (R.Width != R.NumElements) + Last = (R.NumElements / R.Width - 1) * R.VStride; + Last += (R.Width - 1) * R.Stride; + Last = R.Offset + Last * R.ElementBytes; + // Compute the GRF number of the first and last byte of the region. + unsigned First = R.Offset / 32U; + Last /= 32U; + if ((First >= Decomposition.size()) || (Last >= Decomposition.size())) { + setNotDecomposing(Inst, "out-of-bounds"); + return; // don't attempt to decompose out-of-bounds accesses + } + if (First != Last) { + // This region spans more than one GRF. Ensure they are all in the same + // decomposed vector. 
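+    //
+    // For illustration: with Decomposition == { 0, 0, 2, 2 } and a region
+    // covering GRFs 1..2 (First == 1, Last == 2), the first loop below also
+    // pulls in GRF 3 (already grouped with Last), and the second rewrites
+    // entries First+1..Last, leaving Decomposition == { 0, 0, 0, 0 }.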
+    for (unsigned i = Last + 1;
+        i != Decomposition.size() && Decomposition[i] == Decomposition[Last];
+        ++i)
+      Decomposition[i] = Decomposition[First];
+    for (unsigned i = First + 1; i != Last + 1; ++i)
+      Decomposition[i] = Decomposition[First];
+  }
+}
+
+/***********************************************************************
+ * reportLocation : report location of a DebugLoc, with nested inline funcs
+ */
+static void reportLocation(const LLVMContext &Ctx, const DebugLoc &DL, raw_ostream &OS)
+{
+  if (auto InlinedAt = DL.getInlinedAt()) {
+    reportLocation(Ctx, DebugLoc(InlinedAt), OS);
+    OS << ": in function inlined here:\n";
+  }
+  StringRef Filename = "";
+  unsigned Line = 0;
+  unsigned Col = 0;
+  if (DL) {
+    Filename = DL->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  OS << Filename;
+  if (Line) {
+    OS << ":" << Line;
+    if (Col)
+      OS << ":" << Col;
+  }
+}
+
+static DILocalVariable *getVariable(IntrinsicInst *II) {
+  do {
+    Value *V = II->getOperand(0);
+    Metadata *M = ValueAsMetadata::get(V);
+    if (auto DbgNode = MetadataAsValue::getIfExists(V->getContext(), M))
+      for (auto *U : DbgNode->users())
+        if (auto DVI = dyn_cast<DbgValueInst>(U))
+          return DVI->getVariable();
+    if (!GenXIntrinsic::isWrRegion(V))
+      break;
+    II = cast<IntrinsicInst>(V);
+  } while (1);
+
+  return nullptr;
+}
+
+/***********************************************************************
+ * setNotDecomposing : set NotDecomposing flag and report to user
+ *
+ * Enter:   Inst = instruction to report at (0 to use same location as
+ *                 NotDecomposingReportInst, the "first write" to the web)
+ *          Text = message
+ */
+void VectorDecomposer::setNotDecomposing(Instruction *Inst, const char *Text)
+{
+  NotDecomposing = true;
+  if (NotDecomposingReportInst) {
+    unsigned Bytes = NotDecomposingReportInst->getType()
+        ->getPrimitiveSizeInBits() / 8U;
+    if (Bytes < GenXReportVectorDecomposerFailureThreshold)
+      return;
+    reportLocation(Inst->getContext(),
+        NotDecomposingReportInst->getDebugLoc(), dbgs());
+    dbgs() << ": in decomposition candidate ("
+        << Bytes << " byte vector/matrix) written to here:\n";
+    NotDecomposingReportInst = nullptr;
+  }
+  if (!Inst)
+    Inst = NotDecomposingReportInst;
+  assert(Inst);
+  if (!Inst->getDebugLoc())
+    Inst = Inst->getParent()->getFirstNonPHI();
+  reportLocation(Inst->getContext(), Inst->getDebugLoc(), dbgs());
+  dbgs() << ": vector decomposition failed because: " << Text << "\n";
+}
+
+/***********************************************************************
+ * VectorDecomposer::decompose : decompose web of vectors in Web based on
+ *      Decomposition[] and Offsets[]
+ */
+void VectorDecomposer::decompose()
+{
+  // For each phi node in the web, create a phi node for each decomposed
+  // part, with all incomings set to the decomposed part of the original
+  // incoming if it was constant, otherwise undef.
+  for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) {
+    auto Phi = dyn_cast<PHINode>(*wi);
+    if (!Phi)
+      continue;
+    auto PhiPartsEntry = &PhiParts[Phi];
+    auto Undef = UndefValue::get(Phi->getType());
+    unsigned NumIncomings = Phi->getNumIncomingValues();
+    for (unsigned PartIndex = 0; PartIndex != Offsets.size(); ++PartIndex) {
+      auto PartTy = getPartType(Phi->getType(), PartIndex);
+      auto NewPhi = PHINode::Create(PartTy, NumIncomings,
+          Phi->getName() + ".decomp."
+ Twine(PartIndex), Phi); + for (unsigned ii = 0; ii != NumIncomings; ++ii) { + auto Incoming = dyn_cast(Phi->getIncomingValue(ii)); + if (!Incoming) + Incoming = Undef; + Incoming = getConstantPart(Incoming, PartIndex); + NewPhi->addIncoming(Incoming, Phi->getIncomingBlock(ii)); + } + NewInsts.push_back(NewPhi); + PhiPartsEntry->push_back(NewPhi); + } + } + // Shorten the list of instructions in Web so it only includes phi nodes + // and start wrregions (ones with constant input). We need to do this first + // because other instructions in the web may become erased so checking them + // in the "decompose each tree of values" loop is invalid. + unsigned NewLen = 0; + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + Instruction *Inst = Web[wi]; + if (isa(Inst) || (GenXIntrinsic::isWrRegion(Inst) + && isa(Inst->getOperand(0)))) + Web[NewLen++] = Inst; + } + Web.resize(NewLen); + // Decompose each tree of values in the web rooted at a start wrregion (one + // with constant input) or at each use of a phi node. Each tree can be + // done independently, as we have already put the phi nodes in place to link + // them together. + for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) { + Instruction *Inst = *wi; + if (auto Phi = dyn_cast(Inst)) { + auto Parts = &PhiParts[Phi]; + // decomposeTree removes the use, so we repeatedly process the first use + // until they have all gone. + while (!Phi->use_empty()) + decomposeTree(&*Phi->use_begin(), Parts); + } else { + assert (GenXIntrinsic::isWrRegion(Inst) && isa(Inst->getOperand(0))); + decomposeTree(&Inst->getOperandUse(0), nullptr); + } + } + // Erase original phi nodes. (The other original instructions in the web have + // been erased already.) + for (auto pi = PhiParts.begin(), pe = PhiParts.end(); pi != pe; ++pi) + eraseInst(pi->first); + // Do an aggressive dead code removal pass on instructions that we have added. + removeDeadCode(); +} + +/*********************************************************************** + * VectorDecomposer::decomposeTree : decompose vectors in a tree + * + * Enter: U = use at the root of the tree, one of: + * - the "old value" operand of wrregion (might be constant) + * - the "old value" operand of rdregion + * - the input of bitcast + * - a phi incoming + * PartsIn = decomposed parts of input (not modifiable) + * (0 if *U is constant) + * + * This is a tree of wrregion and bitcast instructions, with phi node uses + * and rdregions at the leaves. + * + * This function traverses the tree using self recursion. + */ +void VectorDecomposer::decomposeTree(Use *U, + const SmallVectorImpl *PartsIn) +{ + auto Inst = cast(U->getUser()); + if (auto Phi = dyn_cast(Inst)) { + decomposePhiIncoming(Phi, U->getOperandNo(), PartsIn); + return; + } + assert(!U->getOperandNo()); + if (GenXIntrinsic::isRdRegion(Inst)) { + decomposeRdRegion(Inst, PartsIn); + return; + } + // Set up the decomposed parts of the incoming value. + SmallVector Parts; + if (PartsIn) + for (unsigned i = 0, e = PartsIn->size(); i != e; ++i) + Parts.push_back((*PartsIn)[i]); + else + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) + Parts.push_back(getConstantPart(cast(*U), i)); + // Handle bitcast. + if (isa(Inst)) { + decomposeBitCast(Inst, &Parts); + return; + } + // Handle wrregion. 
+ assert(GenXIntrinsic::isWrRegion(Inst)); + decomposeWrRegion(Inst, &Parts); +} + +/*********************************************************************** + * VectorDecomposer::decomposePhiIncoming : decompose a use in a phi node + * + * Enter: Phi = the phi node + * OperandNum = operand number in the phi node + * PartsIn = decomposed parts of input (not modifiable) + */ +void VectorDecomposer::decomposePhiIncoming(PHINode *Phi, unsigned OperandNum, + const SmallVectorImpl *PartsIn) +{ + // For each part, find the decomposed phi node and set its + // corresponding incoming. + auto PhiPartsEntry = &PhiParts[Phi]; + for (unsigned PartIndex = 0, NumParts = PartsIn->size(); + PartIndex != NumParts; ++PartIndex) { + auto PhiPart = cast((*PhiPartsEntry)[PartIndex]); + PhiPart->setIncomingValue(OperandNum, (*PartsIn)[PartIndex]); + } + // Set the incoming in the original phi node to undef, to remove the use. + Phi->setIncomingValue(OperandNum, UndefValue::get(Phi->getType())); +} + +/*********************************************************************** + * VectorDecomposer::decomposeRdRegion : decompose a rdregion + * + * Enter: RdRegion = the rdregion instruction + * PartsIn = decomposed parts of input (not modifiable) + */ +void VectorDecomposer::decomposeRdRegion(Instruction *RdRegion, + const SmallVectorImpl *PartsIn) +{ + Region RdR(RdRegion, BaleInfo()); + unsigned PartIndex = getPartIndex(&RdR); + Value *Part = (*PartsIn)[PartIndex]; + if (isa(Part)) { + // Check if this region read is used as a two addr operand. + auto isUsedInTwoAddr = [](Value *V) { + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto CI = dyn_cast(user)) { + if (getTwoAddressOperandNum(CI) == (int)ui->getOperandNo()) + return true; + } + } + return false; + }; + + // Do not emit a warning if this undef is being used as old value + // in a two-addr instruction. + if (!isUsedInTwoAddr(RdRegion)) { + if (auto N = getVariable(cast(RdRegion))) { + emitWarning(RdRegion, "undefined value from '" + N->getName() + + "' is referenced after decomposition"); + } else + emitWarning(RdRegion, + "undefined value is referenced after decomposition"); + } + } + if (RdRegion->getType() == Part->getType() && RdR.isContiguous() + && isa(RdRegion->getType())) { + // The rdregion reads the whole of the decomposed part of the vector (and + // has a vector result even if single element). + // Just replace uses and erase. + RdRegion->replaceAllUsesWith(Part); + eraseInst(RdRegion); + return; + } + // The rdregion reads only some of the decomposed part of the vector. + // Create a new rdregion to replace the old one, taking its name. 
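+  // (RdR.Offset was relative to the start of the whole vector; subtracting
+  // the part's byte offset below rebases it so the new rdregion reads from
+  // the decomposed part instead.)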
+ RdR.Offset -= getPartOffset(PartIndex); + auto NewRdRegion = RdR.createRdRegion(Part, + "", RdRegion, RdRegion->getDebugLoc(), + /*AllowScalar=*/!isa(RdRegion->getType())); + NewRdRegion->takeName(RdRegion); + RdRegion->replaceAllUsesWith(NewRdRegion); + assert(Seen.find(RdRegion) == Seen.end()); + eraseInst(RdRegion); +} + +/*********************************************************************** + * VectorDecomposer::decomposeWrRegion : decompose a wrregion + * + * Enter: WrRegion = the wrregion instruction + * Parts = decomposed parts of input (modifiable) + */ +void VectorDecomposer::decomposeWrRegion(Instruction *WrRegion, + SmallVectorImpl *Parts) +{ + Region WrR(WrRegion, BaleInfo()); + unsigned PartIndex = getPartIndex(&WrR); + Value *Part = (*Parts)[PartIndex]; + if (WrRegion->getOperand(NewValueOperandNum)->getType() == Part->getType() + && !WrR.Mask) { + // The wrregion writes the whole of the decomposed part of the vector. + // We can just directly replace the part. + (*Parts)[PartIndex] = WrRegion->getOperand(NewValueOperandNum); + } else { + // The wrregion writes only some of the decomposed part of the vector. + // Create a new wrregion. + WrR.Offset -= getPartOffset(PartIndex); + auto NewInst = cast(WrR.createWrRegion(Part, + WrRegion->getOperand(NewValueOperandNum), + WrRegion->getName() + ".decomp." + Twine(PartIndex), + WrRegion, WrRegion->getDebugLoc())); + (*Parts)[PartIndex] = NewInst; + NewInsts.push_back(NewInst); + } + // Decompose its uses. decomposeTree removes the use, so we repeatedly process + // the first use until they have all gone. + while (!WrRegion->use_empty()) + decomposeTree(&*WrRegion->use_begin(), Parts); + // Now the original wrregion has no uses, and we can remove it. + eraseInst(WrRegion); +} + +/*********************************************************************** + * VectorDecomposer::decomposeBitCast : decompose a bitcast + * + * Enter: Inst = the bitcast instruction + * Parts = decomposed parts of input (modifiable) + */ +void VectorDecomposer::decomposeBitCast(Instruction *Inst, + SmallVectorImpl *Parts) +{ + // Create a new bitcast for each decomposed part, other than when the part + // is undef. (We handle the undef case as it is common, when only some of the + // vector has been set up. Other constant cases we leave to the EarlyCSE pass + // that comes after this pass.) + for (unsigned PartIndex = 0, NumParts = Parts->size(); + PartIndex != NumParts; ++PartIndex) { + Type *NewTy = getPartType(Inst->getType(), PartIndex); + if (isa((*Parts)[PartIndex])) + (*Parts)[PartIndex] = UndefValue::get(NewTy); + else { + auto NewInst = CastInst::Create(Instruction::BitCast, (*Parts)[PartIndex], + NewTy, Inst->getName() + ".decomp." + Twine(PartIndex), Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + NewInsts.push_back(NewInst); + (*Parts)[PartIndex] = NewInst; + } + } + // Decompose its uses. decomposeTree removes the use, so we repeatedly process + // the first use until they have all gone. + while (!Inst->use_empty()) + decomposeTree(&*Inst->use_begin(), Parts); + // Now the original wrregion has no uses, and we can remove it. 
+ eraseInst(Inst); +} + +/*********************************************************************** + * VectorDecomposer::getPartIndex : get the part index for the region + */ +unsigned VectorDecomposer::getPartIndex(Region *R) +{ + return Decomposition[R->Offset / 32U]; +} + +/*********************************************************************** + * VectorDecomposer::getPartOffset : get the byte offset of a part + */ +unsigned VectorDecomposer::getPartOffset(unsigned PartIndex) +{ + // Offsets[] has the index in GRFs. + return Offsets[PartIndex] * 32; +} + +/*********************************************************************** + * VectorDecomposer::getPartNumBytes : get the size of a part in bytes + */ +unsigned VectorDecomposer::getPartNumBytes(Type *WholeTy, unsigned PartIndex) +{ + if (PartIndex + 1 != Offsets.size()) { + // Not the last part. We can use the offset (in GRFs) difference. + return 32 * (Offsets[PartIndex + 1] - Offsets[PartIndex]); + } + // For the last part, we need to get the total size from WholeTy. + return DL->getTypeSizeInBits(WholeTy) / 8U - 32 * Offsets[PartIndex]; +} + +/*********************************************************************** + * VectorDecomposer::getPartNumElements : get the size of a part in elements + */ +unsigned VectorDecomposer::getPartNumElements(Type *WholeTy, unsigned PartIndex) +{ + Type *ElementTy = WholeTy->getScalarType(); + return getPartNumBytes(WholeTy, PartIndex) + / (DL->getTypeSizeInBits(ElementTy) >> 3); +} + +/*********************************************************************** + * VectorDecomposer::getPartType : get the type of a part + */ +VectorType *VectorDecomposer::getPartType(Type *WholeTy, unsigned PartIndex) +{ + Type *ElementTy = WholeTy->getScalarType(); + return VectorType::get(ElementTy, getPartNumElements(WholeTy, PartIndex)); +} + +/*********************************************************************** + * VectorDecomposer::getConstantPart : get the decomposed part of a constant + */ +Constant *VectorDecomposer::getConstantPart(Constant *Whole, unsigned PartIndex) +{ + Region R(Whole, DL); + R.Offset = getPartOffset(PartIndex); + R.NumElements = R.Width = getPartNumElements(Whole->getType(), PartIndex); + return R.evaluateConstantRdRegion(Whole, /*AllowScalar=*/false); +} + +/*********************************************************************** + * VectorDecomposer::removeDeadCode : aggressive dead code removal on + * instructions added by the vector decomposer + * + * NewInsts contains the instructions added. + */ +void VectorDecomposer::removeDeadCode() +{ + SmallVector Stack; // the "to be processed" stack + std::set Unused; + // Put all newly added instructions into the Unused set. + for (auto i = NewInsts.begin(), e = NewInsts.end(); i != e; ++i) + Unused.insert(*i); + // Look at each newly added instruction. If it is used in anything other than + // one of our newly added instructions, add it to the "to be processed" stack + // and remove it from the Unused set. (It also counts as used an instruction + // that is used in another of our newly added instructions that happens to + // have already been seen as used. It doesn't matter either way that this + // happens.) 
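+  //
+  // In effect this is a small mark-and-sweep over just the newly added
+  // instructions: the loop below seeds the worklist with new instructions
+  // that have a use outside the new set, the worklist loop then marks their
+  // operands (or phi incomings) transitively, and whatever is still in the
+  // Unused set afterwards is erased.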
+ for (auto i = NewInsts.begin(), e = NewInsts.end(); i != e; ++i) { + Instruction *Inst = *i; + bool IsUsed = false; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (Unused.find(user) == Unused.end()) + IsUsed = true; + } + if (IsUsed) { + Stack.push_back(Inst); + Unused.erase(Inst); + } + } + // Process each entry on the stack. + while (!Stack.empty()) { + Instruction *Inst = Stack.back(); + Stack.pop_back(); + // Inst is used, perhaps indirectly, by something outside the web. + // Mark instructions it uses as used. For wrregion and bitcast, this + // is just operand 0. For a phi node, it is all incomings. + if (auto Phi = dyn_cast(Inst)) { + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) { + auto Incoming = dyn_cast(Phi->getIncomingValue(ii)); + auto it = Unused.find(Incoming); + if (it == Unused.end()) + continue; + // Incoming is an instruction currently in the unused set. Remove it + // from the set, and add it to the "to be processed" stack. + Unused.erase(it); + Stack.push_back(Incoming); + } + } else { + auto Operand = dyn_cast(Inst->getOperand(0)); + auto it = Unused.find(Operand); + if (it != Unused.end()) { + // Operand is an instruction currently in the unused set. Remove it + // from the set, and add it to the "to be processed" stack. + Unused.erase(it); + Stack.push_back(Operand); + } + } + } + // Anything left in Unused is really unused, except for uses by other + // instructions in Unused (possibly circularly in the case of phi nodes). + // Erase them all forcibly, by changing all uses to undef first. + for (auto uui = Unused.begin(), uue = Unused.end(); uui != uue; ++uui) { + Instruction *Inst = *uui; + while (!Inst->use_empty()) + *Inst->use_begin() = UndefValue::get((*Inst->use_begin())->getType()); + eraseInst(Inst); + } +} + +/*********************************************************************** + * VectorDecomposer::eraseInst : erase an instruction + * + * This is used in the case that the instruction might be in the Seen + * set. So we delay actually deleting it until the end of processing the + * function. + */ +void VectorDecomposer::eraseInst(Instruction *Inst) +{ + Inst->removeFromParent(); + ToDelete.push_back(Inst); + // Remove all non-constant operands. 
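+  // (Setting the operands to undef below drops this instruction's uses of
+  // other values, so those values do not appear to still have users while the
+  // erased instruction sits on the ToDelete list.)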
+ for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + Value *Opnd = Inst->getOperand(i); + if (isa(Opnd)) + continue; + Inst->setOperand(i, UndefValue::get(Opnd->getType())); + } +} + +void VectorDecomposer::emitWarning(Instruction *Inst, const Twine &Msg) { + DiagnosticVectorDecomposition Warn(Inst, Msg, DS_Warning); + Inst->getContext().diagnose(Warn); +} + +// Decompose +// +// %33 = fcmp une <24 x float> %25, zeroinitializer +// %34 = fcmp oeq <24 x float> %24, zeroinitializer +// %35 = and <24 x i1> %33, %34 +// %36 = select <24 x i1> %35, <24 x float> +// +// into +// +// %25.0 = rrd(%25, 16, 16, 1) +// %25.1 = rrd(%25, 8, 8, 1) +// %24.0 = rrd(%24, 16, 16, 1) +// %24.1 = rrd(%24, 8, 8, 1) +// %33.0 = fcmp une <16 x float> %25.0, zeroinitializer +// %33.1 = fcmp une <8 x float> %25.1, zeroinitializer +// %34.0 = fcmp oeq <16 x float> %24.0, zeroinitializer +// %34.1 = fcmp oeq <8 x float> %24.1, zeroinitializer +// %35.0 = and <16 x i1> %33.0, %34.0 +// %35.1 = and <8 x i1> %33.1, %34.1 +// $36.0 = select <16 x i1> %35.0, <16 x float> +// %36.1 = select <8 x i1> %35.1, <8 x float> +// %36.new.0 = wrr(<24 x float> undef, <16 x float> %36.0, 0) +// %36.new.1 = wwr(<24 x float> %36.new.0, <8 x float> %36.1, 16) +// +// This allows register pressure reducer to better reorder the above sequence. +// +bool SelectDecomposer::run() { + bool Modified = false; + for (auto Inst : StartSelects) { + Modified |= processStartSelect(Inst); + clear(); + } + return Modified; +} + +bool SelectDecomposer::processStartSelect(Instruction *Inst) { + auto SI = dyn_cast(Inst); + if (!SI || !determineDecomposition(Inst)) + return false; + + // Decompose it and its predicate computation recursively. + decompose(SI); + + // Merge components, starting with undef. + SmallVectorImpl &Parts = DMap[Inst]; + Value *NewInst = UndefValue::get(Inst->getType()); + for (unsigned Idx = 0, N = Decomposition.size(); Idx < N; ++Idx) { + Region R(NewInst); + R.getSubregion(getPartOffset(Idx), getPartNumElements(Idx)); + NewInst = R.createWrRegion(NewInst, Parts[Idx], ".join", Inst, + Inst->getDebugLoc()); + } + Inst->replaceAllUsesWith(NewInst); + return true; +} + +template bool isGlobalVarOperand(const Value *V) { + const T *Inst = dyn_cast(V); + return Inst && + getUnderlyingGlobalVariable(Inst->getPointerOperand()) != nullptr; +} + +bool SelectDecomposer::determineDecomposition(Instruction *Inst) { + auto SI = dyn_cast(Inst); + assert(SI && "select expected"); + VectorType *Ty = dyn_cast(SI->getCondition()->getType()); + if (!Ty) + return false; + unsigned NumElts = Ty->getNumElements(); + if (NumElts <= 16) + return false; + if (!isa(SI->getCondition())) + return false; + + // Disable select decomposition if this select may be used in g_store bale. + // Otherwise, g_store bale cannot be created correctly due to a missing load + // of a global that will be stored(it is one of the requirements to g_store + // bales). The change fixes FRC_global and FRC_MC_global tests. + if (std::any_of(Inst->user_begin(), Inst->user_end(), + isGlobalVarOperand) || + std::any_of(Inst->value_op_begin(), Inst->value_op_end(), + isGlobalVarOperand)) + return false; + + // Extra checks to avoid aggressive splitting. + auto BB = Inst->getParent(); + auto check = [=](Instruction *I) { + if (!I->hasOneUse() || I->getParent() != BB) { + setNotDecomposing(); + return false; + } + return true; + }; + + // This determines the width of predicate operands. 
+ // We consider the following two factors + // - The type size of sel + // - The input operands + unsigned Width = GenXDefaultSelectPredicateWidth; + if (Width > 32) + Width = 32; + else if (Width < 16) + Width = 16; + else if (SI->getType()->getScalarSizeInBits() >= 32) + Width = 16; + + // If there is a region read with a non-unit stride, + // then adjust the splitting width appropriately. + auto adjustWidth = [=, &Width](Value *V) { + // If this region read only supports up to 16, then do not split into + // simd 32. Otherwise it makes difficult to bale in this region read. + if (Width == 32 && GenXIntrinsic::isRdRegion(V)) { + CallInst *CI = cast(V); + Region R(CI, BaleInfo()); + unsigned LegalSize = R.getLegalSize( + 0, true /*Allow2D*/, + CI->getOperand(0)->getType()->getVectorNumElements(), ST); + if (LegalSize < 32) + Width = 16; + } + }; + + addToWeb(SI->getCondition()); + for (unsigned i = 0; i != Web.size(); ++i) { + Inst = Web[i]; + if (!check(Inst)) + break; + unsigned OpCode = Inst->getOpcode(); + switch (OpCode) + { + default: + setNotDecomposing(); + break; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + addToWeb(Inst->getOperand(0)); + addToWeb(Inst->getOperand(1)); + adjustWidth(Inst->getOperand(0)); + adjustWidth(Inst->getOperand(1)); + break; + case Instruction::FCmp: + case Instruction::ICmp: + adjustWidth(Inst->getOperand(0)); + adjustWidth(Inst->getOperand(1)); + break; + } + } + + if (NotDecomposing) + return false; + + Offsets.clear(); + unsigned Offset = 0; + unsigned Remaining = NumElts; + while (Remaining > Width) { + Decomposition.push_back(Width); + Offsets.push_back(Offset); + Remaining -= Width; + Offset += Width; + } + if (Remaining > 0) { + Decomposition.push_back(Remaining); + Offsets.push_back(Offset); + } +#if _DEBUG + unsigned NumParts = (NumElts + Width - 1) / Width; + assert(NumParts == Decomposition.size()); + assert(NumParts == Offsets.size()); +#endif + + return true; +} + +void SelectDecomposer::addToWeb(Value *V) { + if (isa(V)) + return; + auto Inst = dyn_cast(V); + if (!Inst) { + // Cannot decompose with an argument in the web. 
+ setNotDecomposing(); + return; + } + if (Seen.find(Inst) != Seen.end()) + return; + + Seen.insert(Inst); + Web.push_back(Inst); +} + +void SelectDecomposer::decompose(Instruction *Inst) { + if (isa(Inst)) + decomposeSelect(Inst); + else if (isa(Inst)) + decomposeCmp(Inst); + else { + assert(Inst->getOpcode() == Instruction::And || + Inst->getOpcode() == Instruction::Or || + Inst->getOpcode() == Instruction::Xor); + decomposeBinOp(Inst); + } +} + +void SelectDecomposer::decomposeSelect(Instruction *Inst) { + SelectInst *SI = cast(Inst); + if (auto I = dyn_cast(SI->getCondition())) + decompose(I); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + + Value *OpC = SI->getCondition(); + Value *OpT = SI->getTrueValue(); + Value *OpF = SI->getFalseValue(); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *OpC_I = getPart(OpC, Idx, Inst); + Value *OpT_I = getPart(OpT, Idx, Inst); + Value *OpF_I = getPart(OpF, Idx, Inst); + Value *NewInst = B.CreateSelect(OpC_I, OpT_I, OpF_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +void SelectDecomposer::decomposeBinOp(Instruction *Inst) { + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + if (auto I = dyn_cast(Op0)) + decompose(I); + if (auto I = dyn_cast(Op1)) + decompose(I); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *Op0_I = getPart(Op0, Idx, Inst); + Value *Op1_I = getPart(Op1, Idx, Inst); + Value *NewInst = B.CreateBinOp(Instruction::BinaryOps(Inst->getOpcode()), + Op0_I, Op1_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +void SelectDecomposer::decomposeCmp(Instruction *Inst) { + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + CmpInst *CI = cast(Inst); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *Op0_I = getPart(Op0, Idx, Inst); + Value *Op1_I = getPart(Op1, Idx, Inst); + Value *NewInst = nullptr; + if (isa(CI)) + NewInst = B.CreateICmp(CI->getPredicate(), Op0_I, Op1_I, Inst->getName()); + else + NewInst = B.CreateFCmp(CI->getPredicate(), Op0_I, Op1_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +Value *SelectDecomposer::getPart(Value *Whole, unsigned PartIndex, + Instruction *Inst) const { + auto I = DMap.find(Whole); + if (I != DMap.end()) { + assert(I->second.size() > PartIndex); + return I->second[PartIndex]; + } + + unsigned Offset = getPartOffset(PartIndex); + unsigned NumElts = getPartNumElements(PartIndex); + + if (Whole->getType()->getScalarType()->isIntegerTy(1)) { + auto C = dyn_cast(Whole); + assert(C && "constant expected"); + if (Constant *V = C->getSplatValue()) + return ConstantVector::getSplat(NumElts, V); + SmallVector Values; + for (unsigned Idx = Offset; Idx < Offset + NumElts; ++Idx) + Values.push_back(C->getAggregateElement(Idx)); + return ConstantVector::get(Values); + } + + const DataLayout &DL = Inst->getModule()->getDataLayout(); + Region R(Whole, &DL); + R.Offset = Offset * R.ElementBytes; + R.NumElements = R.Width = NumElts; + + if (auto C = dyn_cast(Whole)) + return R.evaluateConstantRdRegion(C, /*AllowScalar=*/false); + return 
R.createRdRegion(Whole, ".in", Inst, Inst->getDebugLoc());
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h
new file mode 100644
index 000000000000..c7bf04e31e14
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h
@@ -0,0 +1,175 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXVectorDecomposer
+/// --------------------
+///
+/// GenXVectorDecomposer is not a pass; instead it is a class that is called by
+/// the GenXPostLegalization pass to perform vector decomposition.
+///
+/// For a vector written by wrregion and read by rdregion, it finds the way that
+/// the vector can be divided into parts, with each part a range of one or more
+/// GRFs, such that no rdregion or wrregion crosses a part boundary. Then it
+/// decomposes the vector into those parts. A rdregion/wrregion that reads/writes
+/// a whole part can be removed completely; a rdregion/wrregion that reads/writes
+/// only some of the part is replaced to read/write just the applicable part.
+///
+/// In fact it does all this for a web of vectors linked by wrregion, phi nodes
+/// and bitcasts.
+///
+/// The idea is that having lots of small vectors instead of one big vector
+/// reduces register fragmentation in the finalizer's register allocator.
+///
+/// There is an option -limit-genx-vector-decomposer=N to aid debugging the code
+/// changes made by the vector decomposer.
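+///
+/// For example, a vector spanning two GRFs that is only ever written by two
+/// separate single-GRF wrregions and read back by two matching single-GRF
+/// rdregions can be decomposed into two single-GRF vectors; each of those
+/// accesses then covers a whole decomposed vector and can be removed, while
+/// an access covering only part of one would be rewritten against that part.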
+///
+//===----------------------------------------------------------------------===//
+#include "GenXRegion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instructions.h"
+#include <map>
+#include <set>
+
+namespace llvm {
+
+class Constant;
+class DominatorTree;
+class Instruction;
+class PHINode;
+class Type;
+class Use;
+
+namespace genx {
+class Region;
+}
+
+// VectorDecomposer : decomposes vectors in a function
+class VectorDecomposer {
+  DominatorTree *DT;
+  const DataLayout *DL = nullptr;
+  SmallVector<Instruction *, 8> StartWrRegions;
+  std::set<Instruction *> Seen;
+  SmallVector<Instruction *, 8> Web;
+  SmallVector<Instruction *, 8> ToDelete;
+  bool NotDecomposing;
+  Instruction *NotDecomposingReportInst;
+  SmallVector<unsigned, 8> Decomposition;
+  SmallVector<unsigned, 8> Offsets;
+  std::map<PHINode *, SmallVector<Value *, 8>> PhiParts;
+  SmallVector<Instruction *, 8> NewInsts;
+public:
+  // clear : clear anything stored
+  void clear() {
+    clearOne();
+    StartWrRegions.clear();
+    Seen.clear();
+    ToDelete.clear();
+  }
+  // addStartWrRegion : add a wrregion with undef input to the list
+  void addStartWrRegion(Instruction *Inst) { StartWrRegions.push_back(Inst); }
+  // run : run the vector decomposer on the stored StartWrRegions
+  bool run(DominatorTree *DT);
+private:
+  // clearOne : clear from processing one web
+  void clearOne() {
+    Web.clear();
+    Decomposition.clear();
+    Offsets.clear();
+    PhiParts.clear();
+    NewInsts.clear();
+  }
+  bool processStartWrRegion(Instruction *Inst);
+  bool determineDecomposition(Instruction *Inst);
+  void addToWeb(Value *V, Instruction *User = nullptr);
+  void adjustDecomposition(Instruction *Inst);
+  void setNotDecomposing(Instruction *Inst, const char *Text);
+  void decompose();
+  void decomposeTree(Use *U, const SmallVectorImpl<Value *> *PartsIn);
+  void decomposePhiIncoming(PHINode *Phi, unsigned OperandNum,
+      const SmallVectorImpl<Value *> *PartsIn);
+  void decomposeRdRegion(Instruction *RdRegion,
+      const SmallVectorImpl<Value *> *PartsIn);
+  void decomposeWrRegion(Instruction *WrRegion, SmallVectorImpl<Value *> *Parts);
+  void decomposeBitCast(Instruction *Inst, SmallVectorImpl<Value *> *Parts);
+  unsigned getPartIndex(genx::Region *R);
+  unsigned getPartOffset(unsigned PartIndex);
+  unsigned getPartNumBytes(Type *WholeTy, unsigned PartIndex);
+  unsigned getPartNumElements(Type *WholeTy, unsigned PartIndex);
+  VectorType *getPartType(Type *WholeTy, unsigned PartIndex);
+  Constant *getConstantPart(Constant *Whole, unsigned PartIndex);
+  void removeDeadCode();
+  void eraseInst(Instruction *Inst);
+
+  void emitWarning(Instruction *Inst, const Twine &Msg);
+};
+
+// Decompose predicate computation sequences for select
+// to reduce flag register pressure.
+class SelectDecomposer {
+  const GenXSubtarget *ST;
+  bool NotDecomposing = false;
+  SmallVector<Instruction *, 8> StartSelects;
+  SmallVector<Instruction *, 8> Web;
+  SmallVector<unsigned, 8> Decomposition;
+  SmallVector<unsigned, 8> Offsets;
+  std::set<Instruction *> Seen;
+
+  // Map each decomposed instruction to its corresponding part values.
+ SmallDenseMap> DMap; + +public: + explicit SelectDecomposer(const GenXSubtarget *ST) : ST(ST) {} + void addStartSelect(Instruction *Inst) { StartSelects.push_back(Inst); } + bool run(); + +private: + void clear() { + NotDecomposing = false; + Web.clear(); + Decomposition.clear(); + Offsets.clear(); + Seen.clear(); + DMap.clear(); + } + bool processStartSelect(Instruction *Inst); + bool determineDecomposition(Instruction* Inst); + void setNotDecomposing() { NotDecomposing = true; } + void addToWeb(Value *V); + void decompose(Instruction *Inst); + void decomposeSelect(Instruction *Inst); + void decomposeBinOp(Instruction *Inst); + void decomposeCmp(Instruction *Inst); + + unsigned getPartOffset(unsigned PartIndex) const { + return Offsets[PartIndex]; + } + unsigned getPartNumElements(unsigned PartIndex) const { + return Decomposition[PartIndex]; + } + Value *getPart(Value *Whole, unsigned PartIndex, Instruction *Inst) const; +}; + +} // end namespace llvm diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h new file mode 100644 index 000000000000..7415a8411b2a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h @@ -0,0 +1,140 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains defines for vISA and the vISA writer. 
+// +//===----------------------------------------------------------------------===// +#ifndef GENXVISA_H +#define GENXVISA_H + +#include "GenX.h" +#include "GenXBaling.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include +#include +#include +#include "GenXModule.h" + +namespace llvm { + namespace visa { + + // vISA relational operators + enum { EQ, NE, GT, GE, LT, LE }; + + + enum { + CLASS_GENERAL, CLASS_ADDRESS, CLASS_PREDICATE, CLASS_INDIRECT, + CLASS_IMMEDIATE = 5, CLASS_STATE }; + + // vISA vector operand modifiers + enum { MOD_ABS = 0x8, MOD_NEG = 0x10, MOD_NEGABS = 0x18, + MOD_SAT = 0x20, MOD_NOT = 0x28 }; + + enum { VISA_NUM_RESERVED_REGS = 32, + VISA_NUM_RESERVED_PREDICATES = 1, + VISA_NUM_RESERVED_SURFACES = 6 }; + + // These reserved indices are used by CM Frontend + // and some passes (like TPM) to create an stateless/slm/stack accesses + // TODO: consider introducing as set of new intrinsics with explicit + // specification of access (to get rid of the relevant hacky code base). + enum ReservedSurfaceIndex { + RSI_Stack = 253, // 253 is for stack access (T1), used by TPM pass + RSI_Slm = 254, // 254 is SLM, which is T0 in vISA + RSI_Stateless = 255 // 255 is stateless, which is T5 in vISA + }; + + // Extracts surface Index (which is expected to be constant) + // from llvm::Value + // TODO: consider replacing dync_cast_or_null to dyn_cast + // TODO: rename convert->extract + inline int convertToSurfaceIndex(const Value* ValueIdx) { + if (const auto CI = dyn_cast_or_null(ValueIdx)) { + int InputValue = static_cast(CI->getZExtValue()); + return InputValue; + } + return -1; + } + + inline ReservedSurfaceIndex getReservedSurfaceIndex(PreDefined_Surface Surface) { + switch(Surface) { + case PreDefined_Surface::PREDEFINED_SURFACE_STACK: + return RSI_Stack; + case PreDefined_Surface::PREDEFINED_SURFACE_SLM: + return RSI_Slm; + case PreDefined_Surface::PREDEFINED_SURFACE_T255: + return RSI_Stateless; + default: + // other types of prefefined surfaces are not used by CM backend + break; + } + llvm_unreachable("unsupported predefined surface"); + } + + inline bool isReservedSurfaceIndex(int SurfaceIndex) { + return SurfaceIndex == RSI_Stateless || SurfaceIndex == RSI_Slm || + SurfaceIndex == RSI_Stack; + } + + inline PreDefined_Surface getReservedSurface(int SurfaceIndex) { + assert(isReservedSurfaceIndex(SurfaceIndex)); + switch(SurfaceIndex) { + case RSI_Stack: + return PreDefined_Surface::PREDEFINED_SURFACE_STACK; + case RSI_Slm: + return PreDefined_Surface::PREDEFINED_SURFACE_SLM; + case RSI_Stateless: + return PreDefined_Surface::PREDEFINED_SURFACE_T255; + } + llvm_unreachable("unexpected surface index"); + } + + enum { VISA_MAX_GENERAL_REGS = 65536 * 256 - 1, + VISA_MAX_ADDRESS_REGS = 4096, + VISA_MAX_PREDICATE_REGS = 4096, + VISA_MAX_SAMPLER_REGS = 32 - 1, + VISA_MAX_SURFACE_REGS = 256, + VISA_MAX_VME_REGS = 16 }; + + enum { VISA_WIDTH_GENERAL_REG = 32 }; + + enum { VISA_ABI_INPUT_REGS_RESERVED = 1, + VISA_ABI_INPUT_REGS_MAX = 128 }; + + enum InputVarType { + VISA_INPUT_GENERAL = 0x0, + VISA_INPUT_SAMPLER = 0x1, + VISA_INPUT_SURFACE = 0x2, + VISA_INPUT_UNKNOWN + }; + + } // end namespace Visa + +} // end namespace llvm +#endif // ndef GENXVISA_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp new file mode 100644 index 000000000000..fd3a70a78866 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp @@ 
-0,0 +1,698 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXVisaRegAlloc is a function group pass that allocates vISA registers to +// LLVM IR values. See GenXVisaRegAlloc.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_REGALLOC" + +#include "GenXVisaRegAlloc.h" +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXUtil.h" +#include "visa_igc_common_header.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; +using namespace visa; + +static cl::opt LimitGenXExtraCoalescing("limit-genx-extra-coalescing", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX extra coalescing.")); + + +char GenXVisaRegAlloc::ID = 0; +INITIALIZE_PASS_BEGIN(GenXVisaRegAlloc, "GenXVisaRegAlloc", "GenXVisaRegAlloc", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_END(GenXVisaRegAlloc, "GenXVisaRegAlloc", "GenXVisaRegAlloc", false, false) + +FunctionGroupPass *llvm::createGenXVisaRegAllocPass() +{ + initializeGenXVisaRegAllocPass(*PassRegistry::getPassRegistry()); + return new GenXVisaRegAlloc(); +} + +void GenXVisaRegAlloc::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the register allocator for this FunctionGroup + * + * This is currently a trivial allocator that just gives a new vISA virtual + * register to every single Value. + */ +bool GenXVisaRegAlloc::runOnFunctionGroup(FunctionGroup &FGArg) +{ + FG = &FGArg; + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + FGA = &getAnalysis(); + BoolTy = Type::getInt1Ty(FG->getContext()); + // Empty out the analysis from the last function it was used on. 
+ RegMap.clear(); + RegStorage.clear(); + PredefinedSurfaceRegs.clear(); + for (unsigned i = 0; i != RegCategory::NUMREALCATEGORIES; ++i) { + CurrentRegId[i] = 0; + } + for (unsigned i = 0; i < VISA_NUM_RESERVED_SURFACES; ++i) { + RegStorage.emplace_back(RegCategory::SURFACE, i); + PredefinedSurfaceRegs.push_back(&RegStorage.back()); + } + for (auto &F : *FG) { + if (F->hasFnAttribute(genx::FunctionMD::CMGenXMain) || + F->hasFnAttribute(genx::FunctionMD::CMStackCall)) + RegMap[F] = KernRegMap_t(); + } + // Reserve the reserved registers. + CurrentRegId[RegCategory::GENERAL] = VISA_NUM_RESERVED_REGS; + CurrentRegId[RegCategory::PREDICATE] = VISA_NUM_RESERVED_PREDICATES; + CurrentRegId[RegCategory::SURFACE] = VISA_NUM_RESERVED_SURFACES; + // Do some extra coalescing. + extraCoalescing(); + // Get the live ranges in a reproducible order. + std::vector LRs; + getLiveRanges(&LRs); + // Allocate a register to each live range. + for (auto i = LRs.begin(), e = LRs.end(); i != e; ++i) + allocReg(*i); + if (CurrentRegId[RegCategory::GENERAL] > VISA_MAX_GENERAL_REGS) + report_fatal_error("Too many vISA general registers"); + if (CurrentRegId[RegCategory::ADDRESS] > VISA_MAX_ADDRESS_REGS) + report_fatal_error("Too many vISA address registers"); + if (CurrentRegId[RegCategory::PREDICATE] > VISA_MAX_PREDICATE_REGS) + report_fatal_error("Too many vISA predicate registers"); + if (CurrentRegId[RegCategory::SAMPLER] > VISA_MAX_SAMPLER_REGS) + report_fatal_error("Too many vISA sampler registers"); + if (CurrentRegId[RegCategory::SURFACE] > VISA_MAX_SURFACE_REGS) + report_fatal_error("Too many vISA surface registers"); + if (CurrentRegId[RegCategory::VME] > VISA_MAX_VME_REGS) + report_fatal_error("Too many vISA VME registers"); + return false; +} + +/*********************************************************************** + * getLiveRanges : get the live ranges in a reproducible order + * + * We scan the code to find the live ranges, rather than just walking the + * GenXLiveness map, to ensure that registers are allocated in a consistent + * order that does not depend on the layout of allocated memory. + * + * This ignores any live range with no category, so such a live range does not + * get allocated a register. GenXArgIndirection uses that to stop an indirected + * argument uselessly getting a register. + */ +void GenXVisaRegAlloc::getLiveRanges(std::vector *LRs) const +{ + // create LRs for global variables. + for (auto &GV : FG->getModule()->globals()) + getLiveRangesForValue(&GV, LRs); + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) + getLiveRangesForValue(&*ai, LRs); + if (fgi != FG->begin() && !F->getReturnType()->isVoidTy()) { + // allocate reg for unified return value + getLiveRangesForValue(Liveness->getUnifiedRet(F), LRs); + } + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) + getLiveRangesForValue(&*bi, LRs); + } + } + for (auto &LR : *LRs) + LR->prepareFuncs(FGA); +} + +void GenXVisaRegAlloc::getLiveRangesForValue(Value *V, + std::vector *LRs) const +{ + auto Ty = V->getType(); + for (unsigned i = 0, e = IndexFlattener::getNumElements(Ty); + i != e; ++i) { + SimpleValue SV(V, i); + LiveRange *LR = Liveness->getLiveRangeOrNull(SV); + if (!LR || LR->getCategory() == RegCategory::NONE) + continue; + // Only process an LR if the map iterator is on the value that appears + // first in the LR. 
That avoids processing the same LR multiple times. + if (SV != *LR->value_begin()) + continue; + LRs->push_back(LR); + } +} + +/*********************************************************************** + * extraCoalescing : do some extra coalescing over and above what + * GenXCoalescing does + * + * GenXCoalescing does coalescing where it saves a copy, for example for + * a two address operand. This function does coalescing that does not save + * a copy, but the two live ranges are related by being the operand (a + * kill use) and result of the same instruction. This is in the hope that + * the jitter's register allocator will be able to do a better job with it. + * + * A further case of extra coalescing is that multiple instances of a constant + * load of a surface variable are coalesced together. This allows the CM code + * to use lots of printfs without running out of surface variables. + */ +void GenXVisaRegAlloc::extraCoalescing() +{ + LiveRange *CommonSurface = nullptr; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + auto Inst = &*bi; + if (isa(Inst->getType())) + continue; + if (GenXIntrinsic::isWrRegion(Inst)) + continue; + auto LR = Liveness->getLiveRangeOrNull(Inst); + if (!LR || LR->Category != RegCategory::GENERAL) + continue; + // Check for convert of constant ot surface. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_constanti: + if (LR->Category == RegCategory::SURFACE + && isa(Inst->getOperand(0))) { + // See if we can coalesce it with CommonSurface. + if (!CommonSurface) + CommonSurface = LR; + else if (!Liveness->interfere(CommonSurface, LR)) + CommonSurface = Liveness->coalesce(CommonSurface, LR, /*DisalowCASC=*/true); + } + break; + default: + break; + } + // We have a non-struct non-wrregion instruction whose result has a + // live range (it is not baled into anything else). + // Check all uses to see if there is one in a non-alu intrinsic. We + // don't want to coalesce that, because of the danger of the jitter + // needing to add an extra move in the send. + bool UseInNonAluIntrinsic = false; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); + ui != ue && !UseInNonAluIntrinsic; ++ui) { + auto user = dyn_cast(ui->getUser()); + assert(user); + if (user->getType()->isVoidTy()) { + UseInNonAluIntrinsic = true; + break; + } + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(user); + switch (IID) { + case GenXIntrinsic::not_any_intrinsic: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + break; + default: { + // It is an intrinsic. A non-alu intrinsic does not have a + // return value that is general. + GenXIntrinsicInfo II(IID); + if (!II.getRetInfo().isGeneral()) + UseInNonAluIntrinsic = true; + } + break; + } + } + if (UseInNonAluIntrinsic) + continue; + + // Do not coalesce when this is a two address instrinsic with undef + // input. Otherwise logic is broken on lifetime marker in vISA emission. 
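+        // (The lambda below walks the intrinsic's argument descriptors and
+        // reports true if any TWOADDR operand is undef, in which case the
+        // result is not coalesced with an operand.)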
+ // + auto skipTwoAddrCoalesce = [](Instruction *Inst) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Inst); + if (IntrinsicID == GenXIntrinsic::not_any_intrinsic) + return false; + GenXIntrinsicInfo Info(IntrinsicID); + const auto *descp = Info.getInstDesc(); + for (const auto *p = descp; *p; ++p) { + GenXIntrinsicInfo::ArgInfo AI(*p); + if (AI.getCategory() != GenXIntrinsicInfo::TWOADDR) + continue; + if (isa(Inst->getOperand(AI.getArgIdx()))) + return true; + } + return false; + }; + if (skipTwoAddrCoalesce(Inst)) + continue; + + // See if we can coalesce with any operand. + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + Value *Operand = Inst->getOperand(oi); + if (isa(Operand)) + continue; + if (Operand->getType() != Inst->getType()) + continue; + // Do not coalesce with kernel arguments as they are input variables. + if (FG->getHead() == F && isa(Operand)) + continue; + auto OperandLR = Liveness->getLiveRangeOrNull(Operand); + if (!OperandLR || OperandLR->Category != RegCategory::GENERAL) + continue; + if (Liveness->interfere(LR, OperandLR)) + continue; + // The two live ranges do not interfere, so we can coalesce them. + if (++CoalescingCount > LimitGenXExtraCoalescing) + continue; + if (LimitGenXExtraCoalescing != UINT_MAX) + dbgs() << "genx extra coalescing " << CoalescingCount << "\n"; + Liveness->coalesce(LR, OperandLR, /*DisalowCASC=*/true); + break; + } + } + } + } +} + +/*********************************************************************** + * allocReg : allocate a register for a LiveRange + */ +void GenXVisaRegAlloc::allocReg(LiveRange *LR) +{ + if (LR->value_empty()) + return; + if (LR->getCategory() >= RegCategory::NUMREALCATEGORIES) + return; // don't allocate register to EM or RM value + LLVM_DEBUG( + dbgs() << "Allocating "; + LR->print(dbgs()); + dbgs() << "\n" + ); + SimpleValue V = *LR->value_begin(); + Type *Ty = V.getType(); + if (auto GV = dyn_cast(V.getValue())) + if (GV->hasAttribute(genx::FunctionMD::GenXVolatile)) + Ty = Ty->getPointerElementType(); + assert(!Ty->isVoidTy()); + if (LR->Category == RegCategory::PREDICATE) { + VectorType *VT = dyn_cast(Ty); + assert((!VT || genx::exactLog2(VT->getNumElements()) >= 0) && "invalid predicate width"); + (void)VT; + } + // Allocate the register, also setting the alignment. + // Assign to the values. If any value is an input arg, ensure the register + // gets its type, to avoid needing an alias for an input arg. + for (auto &F : LR->Funcs) { + Reg *NewReg = + createReg(LR->Category, Ty, DONTCARESIGNED, LR->getLogAlignment()); + if (RegMap.count(F) > 0) { + for (LiveRange::value_iterator vi = LR->value_begin(), + ve = LR->value_end(); + vi != ve; ++vi) { + LLVM_DEBUG(dbgs() << "Allocating reg " << NewReg->Num << " to " + << *(vi->getValue()) << " in func " << F->getName() + << "\n";); + assert(RegMap.at(F).find(*vi) == RegMap.at(F).end()); + RegMap.at(F)[*vi] = NewReg; + if (isa(vi->getValue())) + NewReg->Ty = vi->getType(); + } + } + } +} + +/*********************************************************************** + * getRegForValueUntyped : get the vISA reg allocated to a particular + * value, ignoring signedness and type + * + * This is a const method so it can be called from print(). 
+ */ +GenXVisaRegAlloc::Reg* GenXVisaRegAlloc::getRegForValueUntyped(const Function *kernel, + SimpleValue V) const +{ + // is possible if called for GenXPrinter + if (RegMap.count(kernel) == 0) + return nullptr; + auto& KernMap = RegMap.at(kernel); + KernRegMap_t::const_iterator i = KernMap.find(V); + if (i == KernMap.end()) { + // Check if it's predefined variables. + if (GenXIntrinsic::getGenXIntrinsicID(V.getValue()) != GenXIntrinsic::genx_predefined_surface) + return nullptr; + auto CI = cast(V.getValue()); + unsigned Id = cast(CI->getArgOperand(0))->getZExtValue(); + assert(Id < 4 && "Invalid predefined surface ID!"); + assert(PredefinedSurfaceRegs.size() == VISA_NUM_RESERVED_SURFACES && + "Predefined surface registers have not been initialized"); + return PredefinedSurfaceRegs[Id]; + } + return i->second; +} + +/*********************************************************************** + * getRegForValueOrNull : get the vISA reg allocated to a particular Value + * + * Enter: V = value (Argument or Instruction) to get register for + * Signed = request for signed or unsigned + * OverrideType = 0 else override type of value (used for bitcast) + * + * Called from GenXVisaFunctionWriter to get the register for an + * operand. The operand type might not match the register type (say a + * bitcast has been coalesced, or the same integer value is used + * unsigned in one place and signed in another), in which case we + * find/create a vISA register alias. + */ +GenXVisaRegAlloc::Reg* GenXVisaRegAlloc::getRegForValueOrNull( + const Function* kernel, SimpleValue V, Signedness Signed, Type *OverrideType) +{ + if (!OverrideType) + OverrideType = V.getType(); + if (OverrideType->isPointerTy()) { + auto GV = dyn_cast(V.getValue()); + if (GV && GV->hasAttribute(genx::FunctionMD::GenXVolatile)) + OverrideType = OverrideType->getPointerElementType(); + } + Reg* R = getRegForValueUntyped(kernel, V); + if (!R) + return nullptr; // no register allocated + Reg* OriginalReg = R; + + if (R->Category == RegCategory::GENERAL) { + for (;;) { + Type *ExistingType = R->Ty; + if (VectorType *VT = dyn_cast(ExistingType)) + if (VT->getNumElements() == 1) + ExistingType = VT->getElementType(); + if (VectorType *VT = dyn_cast(OverrideType)) + if (VT->getNumElements() == 1) + OverrideType = VT->getElementType(); + if (ExistingType == OverrideType) { + if (R->Signed == Signed || Signed == DONTCARESIGNED) + break; // Match, use this alias. + } + // On to next alias. + auto Next = R->NextAlias[kernel]; + if (Next) { + R = Next; + continue; + } + // Run out of aliases. Add a new one. + Reg* NewReg = createReg(RegCategory::GENERAL, OverrideType, Signed, 0, OriginalReg); + R->NextAlias[kernel] = NewReg; + R = NewReg; + break; + } + } + return R; +} + +/*********************************************************************** + * getSigned : get the signedness of a register + * + * If the register has byte type and is currently don't care signedness, this + * arbitrarily picks unsigned. We do that because having a byte mov with + * different signedness between source and destination can make the jitter + * generate less efficient code. + */ +genx::Signedness GenXVisaRegAlloc::getSigned(Reg* R) +{ + return (R && R->Category == RegCategory::GENERAL) ? + R->Signed : DONTCARESIGNED; +} + +// addRetIPArgument : Add the RetIP argument required for caller kernels and +// their caller. 
+void GenXVisaRegAlloc::addRetIPArgument() { + RetIP = createReg(RegCategory::GENERAL, Type::getInt64Ty(FG->getContext())); +} + +/*********************************************************************** + * TypeDetails constructor + * + * Enter: Ty = LLVM type + * Signedness = whether signed type required + */ +TypeDetails::TypeDetails(const DataLayout &DL, Type *Ty, Signedness Signed) + : DL(DL) { + Type *ElementTy = Ty; + NumElements = 1; + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + if (IntegerType *IT = dyn_cast(ElementTy)) { + BytesPerElement = IT->getBitWidth() / 8; + if (Signed == UNSIGNED) { + switch (BytesPerElement) { + case 1: VisaType = ISA_TYPE_UB; break; + case 2: VisaType = ISA_TYPE_UW; break; + case 4: VisaType = ISA_TYPE_UD; break; + default: VisaType = ISA_TYPE_UQ; break; + } + } else { + switch (BytesPerElement) { + case 1: VisaType = ISA_TYPE_B; break; + case 2: VisaType = ISA_TYPE_W; break; + case 4: VisaType = ISA_TYPE_D; break; + default: VisaType = ISA_TYPE_Q; break; + } + } + } else if (ElementTy->isHalfTy()) { + VisaType = ISA_TYPE_HF; + BytesPerElement = 2; + } else if (ElementTy->isFloatTy()) { + VisaType = ISA_TYPE_F; + BytesPerElement = 4; + } else if (auto PT = dyn_cast(ElementTy)) { + BytesPerElement = DL.getPointerTypeSize(PT); + if (BytesPerElement == 4) + VisaType = ISA_TYPE_UD; + else if (BytesPerElement == 8) + VisaType = ISA_TYPE_UQ; + else + report_fatal_error("unsupported pointer type size"); + } else { + assert(ElementTy->isDoubleTy()); + VisaType = ISA_TYPE_DF; + BytesPerElement = 8; + } + if (NumElements > 16384 || NumElements * BytesPerElement > 16384 * 8) + report_fatal_error("Variable too big"); +} + + +/*********************************************************************** + * print : dump the state of the pass. This is used by -genx-dump-regalloc + */ +void GenXVisaRegAlloc::print(raw_ostream &OS, const Module *M) const +{ + // Get the live ranges in a reproducible order, and sort them by "length" + // (the total number of instructions that the live range covers). + struct LiveRangeAndLength { + LiveRange *LR; + unsigned Length; + LiveRangeAndLength(LiveRange *LR, unsigned Length) : LR(LR), Length(Length) {} + bool operator<(const LiveRangeAndLength &Rhs) const { return Length > Rhs.Length; } + }; + std::vector LRs; + getLiveRanges(&LRs); + std::vector LRLs; + for (auto i = LRs.begin(), e = LRs.end(); i != e; ++i) + LRLs.push_back(LiveRangeAndLength(*i, (*i)->getLength(/*WithWeak=*/ false))); + LRs.clear(); + std::sort(LRLs.begin(), LRLs.end()); + // Dump them. Also keep count of the register pressure at each + // instruction number. + std::vector Pressure; + std::vector FlagPressure; + for (auto i = LRLs.begin(), e = LRLs.end(); i != e; ++i) { + // Dump a single live range. + LiveRange *LR = i->LR; + SimpleValue SV = *LR->value_begin(); + Reg* RN = getRegForValueUntyped(&(*(M->begin())), SV); + assert(RN); + OS << "["; + RN->print(OS); + Type *ElTy = IndexFlattener::getElementType(SV.getValue()->getType(), + SV.getIndex()); + unsigned Bytes = (ElTy->getPrimitiveSizeInBits() + 15U) / 8U & -2U; + bool IsFlag = ElTy->getScalarType()->isIntegerTy(1); + OS << "] (" << Bytes << " bytes, length " << i->Length <<") "; + // Dump some indication of what the live range is. For a kernel argument, + // show its name. For an instruction with debug info, show the location. + // We try and find the earliest definition with debug info to show. 
+ unsigned BestNum = UINT_MAX; + Instruction *BestInst = nullptr; + Argument *KernelArg = nullptr; + for (auto i = LR->value_begin(), e = LR->value_end(); i != e; ++i) { + Value *V = i->getValue(); + if (auto Arg = dyn_cast(V)) { + if (Arg->getParent() == FG->getHead()) { + KernelArg = Arg; + break; + } + } else { + auto Inst = cast(V); + if (!isa(Inst)) { + unsigned Num = Numbering->getNumber(Inst); + if (Num < BestNum) { + auto DL = Inst->getDebugLoc(); + if (!DL) { + BestNum = Num; + BestInst = Inst; + } + } + } + } + } + if (KernelArg) + OS << KernelArg->getName(); + else if (BestInst) { + const DebugLoc &DL = BestInst->getDebugLoc(); + OS << DL->getFilename() << ":" << DL.getLine(); + } + // Dump the live range segments, and add each to the pressure score. + OS << ":"; + LR->printSegments(OS); + for (auto si = LR->begin(), se = LR->end(); si != se; ++si) { + if (si->getEnd() >= Pressure.size()) { + Pressure.resize(si->getEnd() + 1, 0); + FlagPressure.resize(si->getEnd() + 1, 0); + } + for (unsigned n = si->getStart(); n != si->getEnd(); ++n) { + Pressure[n] += Bytes; + if (IsFlag) + FlagPressure[n] += Bytes; + } + } + OS << "\n"; + } + OS << "\n"; + // Prepare to print register pressure info. First we need to compute a + // mapping from instruction number to instruction. Only bother with + // instructions with debug info. + std::vector Insts; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!Inst->getDebugLoc()) { + unsigned Num = Numbering->getNumber(Inst); + if (Num >= Insts.size()) + Insts.resize(Num + 1, nullptr); + Insts[Num] = Inst; + } + } + } + } + OS << "Register pressure (bytes):\n"; + unsigned Last = 0; + bool HadInst = false; + Function *LastFunc = nullptr; + for (unsigned n = 0; n != Pressure.size(); ++n) { + if (Pressure[n]) { + Instruction *Inst = nullptr; + if (n < Insts.size()) + Inst = Insts[n]; + if (Pressure[n] != Last) + HadInst = false; + if (Pressure[n] != Last || (!HadInst && Inst)) { + if (Inst && Inst->getParent()->getParent() != LastFunc) { + LastFunc = Inst->getParent()->getParent(); + OS << "In " << LastFunc->getName() << "\n"; + } + Last = Pressure[n]; + OS << Pressure[n] << " at " << n; + if (Inst) { + HadInst = true; + OS << " "; + const DebugLoc &DL = Inst->getDebugLoc(); + DL.print(OS); + } + OS << "\n"; + } + } + } + OS << "Flag pressure (bytes):\n"; + Last = 0; + HadInst = false; + for (unsigned n = 0; n != FlagPressure.size(); ++n) { + Instruction *Inst = nullptr; + if (n < Insts.size()) + Inst = Insts[n]; + if (FlagPressure[n] != Last) + HadInst = false; + if (FlagPressure[n] != Last || (!HadInst && Inst)) { + Last = FlagPressure[n]; + OS << FlagPressure[n] << " at " << n; + if (Inst) { + HadInst = true; + const DebugLoc &DL = Inst->getDebugLoc(); + OS << " " << DL->getFilename() << ":" << DL.getLine(); + } + OS << "\n"; + } + } +} + +/*********************************************************************** + * RegNum::print : print a regnum + */ +void GenXVisaRegAlloc::Reg::print(raw_ostream &OS) const +{ + switch (Category) { + case RegCategory::NONE: OS << "-"; return; + case RegCategory::GENERAL: OS << "v"; break; + case RegCategory::ADDRESS: OS << "a"; break; + case RegCategory::PREDICATE: OS << "p"; break; + case RegCategory::SAMPLER: OS << "s"; break; + case RegCategory::SURFACE: OS << "t"; break; + case 
RegCategory::VME: OS << "vme"; break; + default: OS << "?"; break; + } + OS << Num; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h new file mode 100644 index 000000000000..bf89347ea5cf --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h @@ -0,0 +1,253 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXVisaRegAlloc +/// ---------------- +/// +/// GenXVisaRegAlloc is a function group pass that allocates vISA registers to +/// LLVM IR values. +/// +/// Before allocating registers, this pass does "extra coalescing", over and above +/// what GenXCoalescing does. Two otherwise independent live ranges that are +/// related by being an operand and the result of the same instruction (and are +/// the same type) get coalesced and thus allocated into the same register. +/// +/// However, extra coalescing is not performed when the result of the instruction +/// is used in a non-alu intrinsic, to try and avoid the danger of the jitter +/// needing to add an extra move in the send. +/// +/// Other than that, all this pass does is allocate a different vISA register to +/// each LiveRange. +/// +/// The pass is also an analysis for GenXKernelBuilder to query to find out +/// what vISA register is allocated to a particular Value. In fact, the query +/// from GenXKernelBuilder can specify what type it wants the register to be, +/// and it is at that point that an alias is allocated if there is no existing +/// alias of the requested type. +/// +/// Finally, there are callbacks in the analysis to generate the vISA variable +/// tables to put into the vISA file. 
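+///
+/// As a rough illustration of that query interface (an editorial sketch, not
+/// code from this patch; names such as KernelFunc, SV and Ctx are
+/// placeholders), a consumer running after this pass might ask for a typed
+/// view of a value's register like this:
+///
+///   auto &RA = getAnalysis<GenXVisaRegAlloc>();
+///   // SV is the genx::SimpleValue being emitted, KernelFunc its kernel.
+///   GenXVisaRegAlloc::Reg *R = RA.getRegForValue(
+///       KernelFunc, SV, genx::UNSIGNED, Type::getInt32Ty(Ctx));
+///   // If SV's register was allocated with a different type or signedness,
+///   // an alias register of the requested type is created and returned.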
+///
+//===----------------------------------------------------------------------===//
+#ifndef GENXVISAREGALLOC_H
+#define GENXVISAREGALLOC_H
+
+#include "FunctionGroup.h"
+#include "GenX.h"
+#include "GenXLiveness.h"
+#include "GenXModule.h"
+#include "vc/GenXOpts/Utils/RegCategory.h"
+#include "visaBuilder_interface.h"
+#include
+#include
+#include
+
+namespace llvm {
+
+  class Function;
+  class FunctionPass;
+  class raw_ostream;
+  class Type;
+  class Value;
+
+  FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner);
+
+  // GenXVisaRegAlloc : vISA virtual register allocator pass
+  class GenXVisaRegAlloc : public FunctionGroupPass {
+  public:
+
+    // Reg : a virtual register
+    class Reg {
+    public:
+      unsigned short Category = genx::RegCategory::NONE;
+      // Register ID. Its first value depends on the number of predefined
+      // variables in the category, e.g. for general variables it is 32.
+      unsigned short Num = 0;
+      // Pointer to the register that is aliased by this register.
+      Reg* AliasTo = nullptr;
+      // Singly linked list storing all aliases of the real register.
+      std::map<const Function*, Reg*> NextAlias;
+      genx::Signedness Signed = genx::DONTCARESIGNED;
+      Type *Ty = nullptr;
+      // log2 of the minimum alignment requested by users of the register.
+      unsigned Alignment;
+      // String representation of the register, normally a combination of
+      // Category and Num.
+      std::string NameStr;
+      // Attributes
+      std::vector<std::pair<unsigned, std::string>> Attributes;
+      // Pointer to the VISA variable. It is set by CisaBuilder when it creates
+      // VISA variables for all registers in RegMap.
+      std::map<VISAKernel*, void*> GenVar;
+
+      explicit Reg(
+          unsigned Category,
+          unsigned Num,
+          Type *Ty = 0,
+          genx::Signedness Signed = genx::DONTCARESIGNED,
+          unsigned LogAlignment = 0,
+          Reg* AliasTo = nullptr)
+          : Category(Category), Num(Num), AliasTo(AliasTo), Signed(Signed),
+            Ty(Ty), Alignment(LogAlignment) {
+        static const char* Prefix[] = { "ERR", "V", "A", "P", "S", "T", "VME" };
+        assert(Category && Category < genx::RegCategory::NUMREALCATEGORIES);
+        NameStr = Prefix[Category] + std::to_string(Num);
+      }
+
+      // Get the VISA variable assigned to the register.
+      // The template parameter T is only a cast for the return type. Ideally
+      // we would assert here that the requested type matches the real type
+      // stored in GenVar.
+      template <class T>
+      T* GetVar(VISAKernel* F) {
+        return reinterpret_cast<T*>(GenVar[F]);
+      }
+
+      // Set the VISA variable for the current register.
+      void SetVar(VISAKernel *F, void* Var) {
+        GenVar[F] = Var;
+      }
+
+      void addAttribute(unsigned AttrName, Twine AttrVal) {
+        Attributes.push_back(std::make_pair(AttrName, AttrVal.str()));
+      }
+
+      void print(raw_ostream &OS) const;
+    };
+
+    using RegPushHook = void(*)(void* Object, Reg&);
+    using KernRegMap_t = std::map<genx::SimpleValue, Reg*>;
+    using RegMap_t = std::map<const Function*, KernRegMap_t>;
+  private:
+    FunctionGroup *FG;
+    GenXLiveness *Liveness;
+    GenXNumbering *Numbering;
+    FunctionGroupAnalysis *FGA;
+
+    // pushReg callback that will be called once a new register is created.
+    RegPushHook TheRegPushHook = nullptr;
+    // Object that will be passed to the hook, typically the 'this' pointer of
+    // the hook's owner.
+    void* TheRegPushHookObject = nullptr;
+
+    // Storage for all created registers. A std::list is used because we keep
+    // pointers to the stored registers, so the storage must never reallocate.
+    std::list<Reg> RegStorage;
+    // Map from LLVM Value to the register associated with it.
+    RegMap_t RegMap;
+    // List of pointers to predefined surface registers.
+    std::vector<Reg*> PredefinedSurfaceRegs;
+
+    // Array of the current indices being assigned to new registers, one per
+    // register category.
+ unsigned CurrentRegId[genx::RegCategory::NUMREALCATEGORIES]; + + public: + static char ID; + explicit GenXVisaRegAlloc() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX vISA virtual register allocator"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + + std::list& getRegStorage() { + return RegStorage; + } + // Get the vISA virtual register for a value (assert if none) + Reg* getRegForValue(const Function *kernel, genx::SimpleValue V, + genx::Signedness Signed = genx::DONTCARESIGNED, Type *OverrideType = 0) + { + Reg* R = getRegForValueOrNull(kernel, V, Signed, OverrideType); + assert(R && "no register allocated for this value"); + return R; + } + // Get the vISA virtual register for a value or nullptr if there is no + // register associated with given value. + Reg* getRegForValueOrNull(const Function *kernel, genx::SimpleValue V, + genx::Signedness Signed = genx::DONTCARESIGNED, Type *OverrideType = 0); + + // Get the vISA virtual register for a value (0 if none), ignoring type + // and signedness so it can be a const method usable from print(). + Reg* getRegForValueUntyped(const Function* kernel, genx::SimpleValue V) const; + + // Get the signedness of a register. + genx::Signedness getSigned(Reg* R); + + // Set callback that will be called each time new register is created. + // It is used in CisaBuilder when new aliases are created. + void SetRegPushHook(void* Object, RegPushHook Callback) { + TheRegPushHook = Callback; + TheRegPushHookObject = Object; + } + + // Create new register and push it in storage. + // If RegPushHook was specified it will be called with created register as + // parameter. Thus, all needed register's variables must be specified + // at this moment, for example AliasTo. + template + Reg* createReg(unsigned Category, Args&& ... args) { + RegStorage.emplace_back(Category, CurrentRegId[Category]++, + std::forward(args) ...); + Reg& R = RegStorage.back(); + if (TheRegPushHook) + TheRegPushHook(TheRegPushHookObject, R); + return &R; + } + + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + // print : dump the state of the pass. This is used by -genx-dump-regalloc + virtual void print(raw_ostream &O, const Module *M) const; + private: + Type *BoolTy; + void getLiveRanges(std::vector *LRs) const; + void getLiveRangesForValue(Value *V, std::vector *LRs) const; + void extraCoalescing(); + void allocReg(genx::LiveRange *LR); + public: + // Add special RetIP argument. + Reg* getRetIPArgument() const { return RetIP; } + void addRetIPArgument(); + private: + unsigned CoalescingCount = 0; + Reg* RetIP; + }; + + namespace visa { + // Details of a type required for a vISA general register declaration + // or an indirect operand. 
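+ // For example, an <8 x i32> value is described as NumElements = 8,
+ // BytesPerElement = 4 and VisaType = ISA_TYPE_D (ISA_TYPE_UD when an
+ // unsigned view is requested); see the TypeDetails constructor in
+ // GenXVisaRegAlloc.cpp.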
+ struct TypeDetails { + const DataLayout &DL; + unsigned NumElements; + unsigned BytesPerElement; + unsigned VisaType; + TypeDetails(const DataLayout &DL, Type *Ty, genx::Signedness Signed); + }; + } // end namespace visa + + void initializeGenXVisaRegAllocPass(PassRegistry &); + +} // end namespace llvm +#endif //ndef GENXVISAREGALLOC_H + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp new file mode 100644 index 000000000000..bda27d3d1859 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp @@ -0,0 +1,34 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "GenXWATable.h" + +using namespace llvm; + +char GenXWATable::ID = 0; + +INITIALIZE_PASS_BEGIN(GenXWATable, "GenXWATable", "GenXWATable", false, true) +INITIALIZE_PASS_END(GenXWATable, "GenXWATable", "GenXWATable", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h new file mode 100644 index 000000000000..4274db163e0e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h @@ -0,0 +1,57 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef VCOPT_LIB_GENXCODEGEN_GENXWATABLE_H +#define VCOPT_LIB_GENXCODEGEN_GENXWATABLE_H + +#include + +#include + +namespace llvm { + +void initializeGenXWATablePass(PassRegistry &PR); + +// Transparent wrapper around driver WA_TABLE. +class GenXWATable : public ImmutablePass { + WA_TABLE *WaTable = nullptr; + +public: + static char ID; + + GenXWATable() : ImmutablePass(ID) {} + + GenXWATable(WA_TABLE *Table) : ImmutablePass(ID), WaTable{Table} { + initializeGenXWATablePass(*PassRegistry::getPassRegistry()); + } + + // This can return nullptr which means that we don't know + // workarounds for current platform. + WA_TABLE *getWATable() const { return WaTable; } +}; +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp new file mode 100644 index 000000000000..d80fb2907f2e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp @@ -0,0 +1,717 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#if defined(__linux__) +#include +#endif + +#include "GenXOCLRuntimeInfo.h" +#include "GenXWATable.h" + +#include "llvmWrapper/Target/TargetMachine.h" + +#include "vc/GenXCodeGen/GenXTarget.h" +#include "vc/GenXCodeGen/GenXWrapper.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "vc/Support/Options.h" +#include "vc/Support/Status.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXSPIRVReaderAdaptor.h" +#include "llvm/GenXIntrinsics/GenXIntrOpts.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +static Expected> translateSPIRVToIR(ArrayRef Input) { +#if defined(_WIN64) + //TODO: rename to SPIRVDLL64.dll when binary components are fixed + constexpr char *SpirvLibName = "SPIRVDLL.dll"; +#elif defined(_WIN32) + constexpr char *SpirvLibName = "SPIRVDLL32.dll"; +#else + constexpr char *SpirvLibName = "libSPIRVDLL.so"; +#endif + constexpr char *SpirvReadVerifyName = "spirv_read_verify_module"; + using SpirvReadVerifyType = + int(const char *pIn, size_t InSz, + void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), + void *OutUserData, + void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), + void *ErrUserData); + +#if defined(__linux__) + // Hack to workaround cmoc crashes during loading of SPIRV library + static auto DeepBindHack = dlopen(SpirvLibName, RTLD_NOW | RTLD_DEEPBIND); +#endif // __linux__ + + using DL = sys::DynamicLibrary; + std::string ErrMsg; + DL DyLib = DL::getPermanentLibrary(SpirvLibName, &ErrMsg); + if (!DyLib.isValid()) + return make_error(ErrMsg); + + auto *SpirvReadVerifyFunc = reinterpret_cast( + DyLib.getAddressOfSymbol(SpirvReadVerifyName)); + if (!SpirvReadVerifyFunc) + return make_error(SpirvLibName, SpirvReadVerifyName); + + auto OutSaver = [](const char *pOut, size_t OutSize, void *OutData) { + auto *Vec = reinterpret_cast *>(OutData); + Vec->assign(pOut, pOut + OutSize); + }; + auto ErrSaver = [](const char *pErrMsg, void *ErrData) { + auto *ErrStr = reinterpret_cast(ErrData); + *ErrStr = pErrMsg; + }; + + std::vector Result; + int Status = SpirvReadVerifyFunc(Input.data(), Input.size(), OutSaver, + &Result, ErrSaver, &ErrMsg); + if (Status != 0) + return make_error(ErrMsg); + + return {std::move(Result)}; +} + +static Expected> getModule(ArrayRef Input, + LLVMContext &C) { 
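+ // Deserialize the SPIR-V blob: translate it to LLVM bitcode through the
+ // dynamically loaded SPIRV library, parse the bitcode into a Module and run
+ // the IR verifier on the result before returning it.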
+ auto ExpIR = translateSPIRVToIR(Input); + if (!ExpIR) + return ExpIR.takeError(); + + std::vector &IR = ExpIR.get(); + llvm::MemoryBufferRef BufferRef(llvm::StringRef(IR.data(), IR.size()), + "Deserialized SPIRV Module"); + auto ExpModule = llvm::parseBitcodeFile(BufferRef, C); + + if (!ExpModule) + return llvm::handleExpected( + std::move(ExpModule), + []() -> llvm::Error { + llvm_unreachable("Should create new error"); + // Without this dead return MSVC fails with ICE in release-32bit. + return llvm::Error::success(); + }, + [](const llvm::ErrorInfoBase &E) { + return make_error(E.message()); + }); + + if (verifyModule(*ExpModule.get())) + return make_error(); + + return ExpModule; +} + +static void dumpModuleToTemp(const Module &M, const char *Name) { + int FD = -1; + auto EC = sys::fs::openFileForWrite( + Name, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + llvm::errs() << "Can not open file: " << Name << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + M.print(O, nullptr); +} + +static void dumpDataToTemp(StringRef S, const char *Name) { + int FD = -1; + auto EC = sys::fs::openFileForWrite( + Name, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + llvm::errs() << "Can not open file: " << Name << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + O << S; +} + +static vc::ocl::ArgInfo +convertOCLArgInfo(const GenXOCLRuntimeInfo::KernelArgInfo &Info) { + vc::ocl::ArgInfo Converted; + + using ArgKind = GenXOCLRuntimeInfo::KernelArgInfo::KindType; + switch (Info.getKind()) { + case ArgKind::General: + Converted.Kind = vc::ocl::ArgKind::General; + break; + case ArgKind::LocalSize: + Converted.Kind = vc::ocl::ArgKind::LocalSize; + break; + case ArgKind::GroupCount: + Converted.Kind = vc::ocl::ArgKind::GroupCount; + break; + case ArgKind::Buffer: + Converted.Kind = vc::ocl::ArgKind::Buffer; + break; + case ArgKind::SVM: + Converted.Kind = vc::ocl::ArgKind::SVM; + break; + case ArgKind::Sampler: + Converted.Kind = vc::ocl::ArgKind::Sampler; + break; + case ArgKind::Image1D: + Converted.Kind = vc::ocl::ArgKind::Image1d; + break; + case ArgKind::Image2D: + Converted.Kind = vc::ocl::ArgKind::Image2d; + break; + case ArgKind::Image3D: + Converted.Kind = vc::ocl::ArgKind::Image3d; + break; + case ArgKind::PrintBuffer: + Converted.Kind = vc::ocl::ArgKind::PrintBuffer; + break; + case ArgKind::PrivateBase: + Converted.Kind = vc::ocl::ArgKind::PrivateBase; + break; + } + + using ArgAccessKind = GenXOCLRuntimeInfo::KernelArgInfo::AccessKindType; + switch (Info.getAccessKind()) { + case ArgAccessKind::None: + Converted.AccessKind = vc::ocl::ArgAccessKind::None; + break; + case ArgAccessKind::ReadOnly: + Converted.AccessKind = vc::ocl::ArgAccessKind::ReadOnly; + break; + case ArgAccessKind::WriteOnly: + Converted.AccessKind = vc::ocl::ArgAccessKind::WriteOnly; + break; + case ArgAccessKind::ReadWrite: + Converted.AccessKind = vc::ocl::ArgAccessKind::ReadWrite; + break; + } + + Converted.Index = Info.getIndex(); + Converted.Offset = Info.getOffset(); + Converted.SizeInBytes = Info.getSizeInBytes(); + Converted.BTI = Info.getBTI(); + + return Converted; +} + +static void convertOCLKernelInfo(vc::ocl::KernelInfo &Converted, + const GenXOCLRuntimeInfo::KernelInfo &Info) { + Converted.Name = Info.getName(); + std::transform(Info.arg_begin(), Info.arg_end(), + std::back_inserter(Converted.Args), + [](const GenXOCLRuntimeInfo::KernelArgInfo &ArgInfo) { + return convertOCLArgInfo(ArgInfo); + }); + Converted.PrintStrings = 
Info.getPrintStrings(); + Converted.HasGroupID = Info.usesGroupId(); + Converted.HasBarriers = Info.usesBarriers(); + Converted.SLMSize = Info.getSLMSize(); + Converted.ThreadPrivateMemSize = Info.getTPMSize(); + Converted.StatelessPrivateMemSize = Info.getStatelessPrivMemSize(); + Converted.GRFSizeInBytes = Info.getGRFSizeInBytes(); + + if (Info.getRelocationTable().Size > 0) { + Converted.RelocationTable.Buf = Info.getRelocationTable().Buffer; + Converted.RelocationTable.Size = Info.getRelocationTable().Size; + Converted.RelocationTable.NumEntries = + Info.getRelocationTable().Entries; + } + if (Info.getSymbolTable().Size > 0) { + Converted.SymbolTable.Buf = Info.getSymbolTable().Buffer; + Converted.SymbolTable.Size = Info.getSymbolTable().Size; + Converted.SymbolTable.NumEntries = Info.getSymbolTable().Entries; + } +} + + +static std::vector convertInternalOCLInfo( + const std::vector &CompiledKernels) { + std::vector Converted{CompiledKernels.size()}; + for (unsigned i = 0, e = CompiledKernels.size(); i != e; ++i) { + auto &Conv = Converted[i]; + auto &Orig = CompiledKernels[i]; + convertOCLKernelInfo(Conv.KernelInfo, Orig.getKernelInfo()); + Conv.JitInfo = Orig.getJitterInfo(); + Conv.GenBinary = Orig.getGenBinary(); + } + return Converted; +} + +static Triple overrideTripleWithVC(StringRef TripleStr) { + Triple T{TripleStr}; + // Normalize triple. + bool Is32Bit = T.isArch32Bit(); + if (TripleStr.startswith("genx32")) + Is32Bit = true; + return Triple{Is32Bit ? "genx32-unknown-unknown" : "genx64-unknown-unknown"}; +} + +static std::string getSubtargetFeatureString(const vc::CompileOptions &Opts) { + SubtargetFeatures Features; + if (Opts.NoVecDecomp) + Features.AddFeature("disable_vec_decomp"); + if (Opts.Runtime == vc::RuntimeKind::OpenCL) + Features.AddFeature("ocl_runtime"); + return Features.getString(); +} + +static CodeGenOpt::Level getCodeGenOptLevel(const vc::CompileOptions &Opts) { + if (Opts.OptLevel == vc::OptimizerLevel::None) + return CodeGenOpt::None; + return CodeGenOpt::Default; +} + +static Expected> +createTargetMachine(const vc::CompileOptions &Opts, Triple &TheTriple) { + std::string Error; + const Target *TheTarget = + TargetRegistry::lookupTarget(TheTriple.getArchName(), TheTriple, Error); + assert(TheTarget && "vc target was not registered"); + + const std::string FeaturesStr = getSubtargetFeatureString(Opts); + // These ones do not look useful for now. Maybe will be adjusted + // later to account for fp model. 
+ const TargetOptions Options; + CodeGenOpt::Level OptLevel = getCodeGenOptLevel(Opts); + std::unique_ptr TM{ + TheTarget->createTargetMachine(TheTriple.getTriple(), Opts.CPUStr, + FeaturesStr, Options, /*RelocModel=*/None, + /*CodeModel=*/None, OptLevel)}; + if (!TM) + return make_error(); + return {std::move(TM)}; +} + +static void optimizeIR(const vc::CompileOptions &Opts, TargetMachine &TM, + Module &M) { + legacy::PassManager PerModulePasses; + legacy::FunctionPassManager PerFunctionPasses(&M); + + PerModulePasses.add( + createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis())); + PerFunctionPasses.add( + createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis())); + + unsigned OptLevel; + if (Opts.OptLevel == vc::OptimizerLevel::None) + OptLevel = 0; + else + OptLevel = 2; + + PassManagerBuilder PMBuilder; + PMBuilder.Inliner = createFunctionInliningPass(2, 2, false); + PMBuilder.OptLevel = OptLevel; + PMBuilder.SizeLevel = OptLevel; + PMBuilder.SLPVectorize = false; + PMBuilder.LoopVectorize = false; + PMBuilder.DisableUnrollLoops = false; + PMBuilder.MergeFunctions = false; + PMBuilder.PrepareForThinLTO = false; + PMBuilder.PrepareForLTO = false; + PMBuilder.RerollLoops = true; + + TM.adjustPassManager(PMBuilder); + + PMBuilder.populateFunctionPassManager(PerFunctionPasses); + PMBuilder.populateModulePassManager(PerModulePasses); + + // Do we need per function passes at all? + PerFunctionPasses.doInitialization(); + for (Function &F : M) { + if (!F.isDeclaration()) + PerFunctionPasses.run(F); + } + PerFunctionPasses.doFinalization(); + + PerModulePasses.run(M); +} + +static void dumpFinalOutput(const vc::CompileOptions &Opts, const Module &M, + StringRef IsaBinary) { + if (Opts.DumpIR) + dumpModuleToTemp(M, "final.ll"); + if (Opts.DumpIsa) + dumpDataToTemp(IsaBinary, "final.isa"); +} + +static void populateCodeGenPassManager(const vc::CompileOptions &Opts, + TargetMachine &TM, raw_pwrite_stream &OS, + legacy::PassManager &PM) { + TargetLibraryInfoImpl TLII{TM.getTargetTriple()}; + PM.add(new TargetLibraryInfoWrapperPass(TLII)); + // Non-constant pointer. 
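+ // The driver-provided workaround table may legitimately be null; GenXWATable
+ // treats a null table as "no known workarounds for this platform".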
+ WA_TABLE *WaTable = Opts.WATable.get(); + PM.add(new GenXWATable(WaTable)); + + auto FileType = IGCLLVM::TargetMachine::CodeGenFileType::CGFT_AssemblyFile; + bool AddPasses = + TM.addPassesToEmitFile(PM, OS, nullptr, FileType, /*NoVerify*/ true); + assert(!AddPasses && "Bad filetype for vc-codegen"); +} + +static vc::ocl::CompileOutput runOclCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + legacy::PassManager PM; + + SmallString<32> IsaBinary; + raw_svector_ostream OS(IsaBinary); + raw_null_ostream NullOS; + if (Opts.DumpIsa) + populateCodeGenPassManager(Opts, TM, OS, PM); + else + populateCodeGenPassManager(Opts, TM, NullOS, PM); + + std::vector CompiledKernels; + PM.add(createGenXOCLInfoExtractorPass(CompiledKernels)); + + PM.run(M); + dumpFinalOutput(Opts, M, IsaBinary); + + vc::ocl::CompileOutput Output; + Output.Kernels = convertInternalOCLInfo(CompiledKernels); + Output.PointerSizeInBytes = M.getDataLayout().getPointerSize(); + return Output; +} + +static vc::cm::CompileOutput runCmCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + legacy::PassManager PM; + SmallString<32> IsaBinary; + raw_svector_ostream OS(IsaBinary); + populateCodeGenPassManager(Opts, TM, OS, PM); + PM.run(M); + dumpFinalOutput(Opts, M, IsaBinary); + vc::cm::CompileOutput Output; + Output.IsaBinary.assign(IsaBinary.begin(), IsaBinary.end()); + return Output; +} + +static vc::CompileOutput runCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + switch (Opts.Runtime) { + case vc::RuntimeKind::CM: + return runCmCodeGen(Opts, TM, M); + case vc::RuntimeKind::OpenCL: + return runOclCodeGen(Opts, TM, M); + } + llvm_unreachable("Unknown runtime kind"); +} + +Expected vc::Compile(ArrayRef Input, + const vc::CompileOptions &Opts) { + // Environment variable for additional options for debug purposes. + // This will exit with error if options is incorrect and should not + // be used to pass meaningful options required for compilation. 
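+ // For example, a debug build can be run with the environment variable set,
+ // e.g. IGC_VCCodeGenDebugOpts="-print-after-all", to forward extra LLVM cl
+ // options to the backend (the option value here is purely illustrative, not
+ // something this patch prescribes).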
+#ifndef NDEBUG + constexpr const char *DebugEnvVarName = "IGC_VCCodeGenDebugOpts"; + cl::ParseEnvironmentOptions("vc-codegen", DebugEnvVarName); +#endif + + LLVMContext Context; + LLVMInitializeGenXTarget(); + LLVMInitializeGenXTargetInfo(); + llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeTarget(Registry); + + auto ExpModule = getModule(Input, Context); + if (!ExpModule) + return ExpModule.takeError(); + Module &M = *ExpModule.get(); + + legacy::PassManager PerModulePasses; + PerModulePasses.add(createGenXSPIRVReaderAdaptorPass()); + PerModulePasses.add(createGenXRestoreIntrAttrPass()); + PerModulePasses.run(M); + + Triple TheTriple = overrideTripleWithVC(M.getTargetTriple()); + M.setTargetTriple(TheTriple.getTriple()); + + auto ExpTargetMachine = createTargetMachine(Opts, TheTriple); + if (!ExpTargetMachine) + return ExpTargetMachine.takeError(); + TargetMachine &TM = *ExpTargetMachine.get(); + M.setDataLayout(TM.createDataLayout()); + + if (Opts.DumpIR) + dumpModuleToTemp(M, "start.ll"); + + optimizeIR(Opts, TM, M); + + if (Opts.DumpIR) + dumpModuleToTemp(M, "optimized.ll"); + + return runCodeGen(Opts, TM, M); +} + +static Expected +parseOptions(const SmallVectorImpl &Argv, + vc::options::Flags FlagsToInclude) { + const opt::OptTable &Options = vc::getOptTable(); + + const bool IsInternal = FlagsToInclude == vc::options::InternalOption; + + unsigned MissingArgIndex = 0; + unsigned MissingArgCount = 0; + opt::InputArgList InputArgs = + Options.ParseArgs(Argv, MissingArgIndex, MissingArgCount, FlagsToInclude); + if (MissingArgCount) + return make_error(Argv[MissingArgIndex], IsInternal); + + // ocloc uncoditionally passes opencl options to internal options. + // Skip checking of internal options for now. + if (!IsInternal) { + if (opt::Arg *A = InputArgs.getLastArg(vc::options::OPT_UNKNOWN, + vc::options::OPT_INPUT)) { + std::string BadOpt = A->getAsString(InputArgs); + return make_error(BadOpt, IsInternal); + } + } + + return {std::move(InputArgs)}; +} + +static Expected parseApiOptions(StringSaver &Saver, + StringRef ApiOptions) { + SmallVector Argv; + cl::TokenizeGNUCommandLine(ApiOptions, Saver, Argv); + + const opt::OptTable &Options = vc::getOptTable(); + // This can be rewritten to parse options and then check for + // OPT_vc_codegen, but it would be better to manually check for + // this option before any real parsing. If it is missing, + // then no parsing should be done at all. + auto HasOption = [&Argv](const std::string &Opt) { + return std::any_of(Argv.begin(), Argv.end(), + [&Opt](const char *ArgStr) { return Opt == ArgStr; }); + }; + const std::string VCCodeGenOptName = + Options.getOption(vc::options::OPT_vc_codegen).getPrefixedName(); + if (HasOption(VCCodeGenOptName)) + return parseOptions(Argv, vc::options::ApiOption); + // Deprecated -cmc parsing just for compatibility. 
+ const std::string IgcmcOptName = + Options.getOption(vc::options::OPT_igcmc).getPrefixedName(); + if (!sys::Process::GetEnv("IGC_VCAvoidCmcFlag") && HasOption(IgcmcOptName)) + return parseOptions(Argv, vc::options::IgcmcApiOption); + + return make_error(); +} + +static Expected +parseInternalOptions(StringSaver &Saver, StringRef InternalOptions) { + SmallVector Argv; + cl::TokenizeGNUCommandLine(InternalOptions, Saver, Argv); + return parseOptions(Argv, vc::options::InternalOption); +} + +static Error fillApiOptions(const opt::ArgList &ApiOptions, + vc::CompileOptions &Opts) { + if (ApiOptions.hasArg(vc::options::OPT_igcmc)) + Opts.OptLevel = vc::OptimizerLevel::None; + if (ApiOptions.hasArg(vc::options::OPT_no_vector_decomposition)) + Opts.NoVecDecomp = true; + + if (opt::Arg *A = ApiOptions.getLastArg(vc::options::OPT_optimize)) { + StringRef Val = A->getValue(); + auto MaybeLevel = StringSwitch>(Val) + .Case("none", vc::OptimizerLevel::None) + .Case("full", vc::OptimizerLevel::Full) + .Default(None); + if (!MaybeLevel) { + const std::string BadOpt = A->getAsString(ApiOptions); + return make_error(BadOpt, /*IsInternal=*/false); + } + Opts.OptLevel = MaybeLevel.getValue(); + } + + return Error::success(); +} + +static Error fillInternalOptions(const opt::ArgList &InternalOptions, + vc::CompileOptions &Opts) { + if (InternalOptions.hasArg(vc::options::OPT_dump_isa_binary)) + Opts.DumpIsa = true; + if (InternalOptions.hasArg(vc::options::OPT_dump_llvm_ir)) + Opts.DumpIR = true; + + if (opt::Arg *A = InternalOptions.getLastArg(vc::options::OPT_runtime)) { + StringRef Val = A->getValue(); + auto MaybeRuntime = StringSwitch>(Val) + .Case("cm", vc::RuntimeKind::CM) + .Case("ocl", vc::RuntimeKind::OpenCL) + .Default(None); + if (!MaybeRuntime) { + const std::string BadOpt = A->getAsString(InternalOptions); + return make_error(BadOpt, /*IsInternal=*/true); + } + Opts.Runtime = MaybeRuntime.getValue(); + } + + if (InternalOptions.hasArg(vc::options::OPT_help)) { + constexpr const char *Usage = "-options \"-vc-codegen [options]\""; + constexpr const char *Title = "Vector compiler options"; + constexpr unsigned FlagsToInclude = vc::options::ApiOption; + constexpr unsigned FlagsToExclude = 0; + constexpr bool ShowAllAliases = false; + vc::getOptTable().PrintHelp(llvm::errs(), Usage, Title, FlagsToInclude, + FlagsToExclude, ShowAllAliases); + } + if (InternalOptions.hasArg(vc::options::OPT_help_internal)) { + constexpr const char *Usage = + "-options \"-vc-codegen\" -internal_options \"[options]\""; + constexpr const char *Title = "Vector compiler internal options"; + constexpr unsigned FlagsToInclude = vc::options::InternalOption; + constexpr unsigned FlagsToExclude = 0; + constexpr bool ShowAllAliases = false; + vc::getOptTable().PrintHelp(llvm::errs(), Usage, Title, FlagsToInclude, + FlagsToExclude, ShowAllAliases); + } + + return Error::success(); +} + +static Expected +fillOptions(const opt::ArgList &ApiOptions, + const opt::ArgList &InternalOptions) { + vc::CompileOptions Opts; + Error Status = fillApiOptions(ApiOptions, Opts); + if (Status) + return {std::move(Status)}; + + Status = fillInternalOptions(InternalOptions, Opts); + if (Status) + return {std::move(Status)}; + + return {std::move(Opts)}; +} + +// Parse global llvm cl options. +// Parsing of cl codegen options should not fail under any circumstances. +static void parseLLVMOptions(const opt::ArgList &Args) { + // Need to control cl options as vector compiler still uses these ones + // to control compilation process. 
This will be addressed later. + llvm::cl::ResetAllOptionOccurrences(); + BumpPtrAllocator Alloc; + StringSaver Saver{Alloc}; + SmallVector Argv{"vc-codegen"}; + for (const std::string &ArgPart : + Args.getAllArgValues(vc::options::OPT_llvm_options)) + cl::TokenizeGNUCommandLine(ArgPart, Saver, Argv); + cl::ParseCommandLineOptions(Argv.size(), Argv.data()); +} + +// Derive llvm options from different API and internal options. +static opt::DerivedArgList +composeLLVMArgs(const opt::InputArgList &ApiArgs, + const opt::InputArgList &InternalArgs, + llvm::StringSaver &Saver) { + const opt::OptTable &Options = vc::getOptTable(); + const opt::Option LLVMOpt = Options.getOption(vc::options::OPT_llvm_options); + + // Pass through old value. + opt::DerivedArgList UpdatedArgs{InternalArgs}; + if (const opt::Arg *BaseArg = + InternalArgs.getLastArg(vc::options::OPT_llvm_options)) + UpdatedArgs.AddSeparateArg(BaseArg, LLVMOpt, BaseArg->getValue()); + + // Add visaopts if any. + if (opt::Arg *VisaArg = ApiArgs.getLastArg(vc::options::OPT_igcmc_visaopts)) { + StringRef WrappedVisaOpts = + Saver.save(Twine{"-finalizer-opts='"} + VisaArg->getValue() + "'"); + UpdatedArgs.AddSeparateArg(VisaArg, LLVMOpt, WrappedVisaOpts); + } + + + // Stack memory size. + if (opt::Arg *StackMemSizeArg = + ApiArgs.getLastArg(vc::options::OPT_igcmc_stack_size)) { + StringRef MemSizeRef = Saver.save(StackMemSizeArg->getAsString(ApiArgs)); + UpdatedArgs.AddSeparateArg(StackMemSizeArg, LLVMOpt, MemSizeRef); + } + + + return UpdatedArgs; +} + +llvm::Expected +vc::ParseOptions(llvm::StringRef ApiOptions, llvm::StringRef InternalOptions) { + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; + auto ExpApiArgList = parseApiOptions(Saver, ApiOptions); + if (!ExpApiArgList) + return ExpApiArgList.takeError(); + const opt::InputArgList &ApiArgs = ExpApiArgList.get(); + + auto ExpInternalArgList = parseInternalOptions(Saver, InternalOptions); + if (!ExpInternalArgList) + return ExpInternalArgList.takeError(); + const opt::InputArgList &InternalArgs = ExpInternalArgList.get(); + + // Prepare additional llvm options (like finalizer args). + opt::DerivedArgList LLVMArgs = composeLLVMArgs(ApiArgs, InternalArgs, Saver); + + // This is a temporary solution until we remove all cl options that + // are accesible by user and affect compilation. + parseLLVMOptions(LLVMArgs); + + return fillOptions(ApiArgs, InternalArgs); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h b/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h new file mode 100644 index 000000000000..0fb00d7050b6 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h @@ -0,0 +1,42 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef IGNORERAUWVALUEMAP_H +#define IGNORERAUWVALUEMAP_H +#include "llvm/IR/ValueMap.h" + +namespace llvm { + +// Configuration for ValueMap that ignores RAUW, instead of moving the map +// entry. +template +struct IgnoreRAUWValueMapConfig : public ValueMapConfig { + enum { FollowRAUW = false }; +}; + +} // End llvm namespace + +#endif // ndef IGNORERAUWVALUEMAP_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h b/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h new file mode 100644 index 000000000000..86f5b0ec18ba --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h @@ -0,0 +1,254 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#pragma once +#include + +/// +/// ISA Description +/// + +#define TYPE_INTEGER ISA_TYPE_UW|ISA_TYPE_W|ISA_TYPE_UB|ISA_TYPE_B|ISA_TYPE_D|ISA_TYPE_UD|ISA_TYPE_Q|ISA_TYPE_UQ +#define TYPE_FLOAT ISA_TYPE_DF|ISA_TYPE_F +#define TYPE_FLOAT_ALL ISA_TYPE_DF|ISA_TYPE_F|ISA_TYPE_HF +#define TYPE_ANY TYPE_INTEGER | TYPE_FLOAT + +#define SIZEOF_CISA_OPCODE sizeof(unsigned char) +#define OPND_DST_GEN 0x100 +#define OPND_SRC_GEN 0x200 +#define OPND_DST_INDIR 0x400 +#define OPND_SRC_INDIR 0x800 +#define OPND_DST_PRED 0x1000 +#define OPND_SRC_PRED 0x2000 +#define OPND_DST_ADDR 0x4000 +#define OPND_SRC_ADDR 0x8000 +#define OPND_ADDRESS_OF 0x10000 +#define OPND_SURFACE 0x20000 +#define OPND_SAMPLE 0x40000 +#define OPND_IMM 0x100000 +#define OPND_PRED 0x200000 +#define OPND_OTHER 0x400000 +#define OPND_RAW_SRC 0x800000 +#define OPND_RAW_DST 0x1000000 + +#define OPND_VECTOR_SRC_G_IMM_AO OPND_SRC_GEN | OPND_IMM | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR +#define OPND_VECTOR_SRC_G_I_IMM_A_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_ADDR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM_P_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_PRED | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_A_AO OPND_SRC_GEN | OPND_SRC_ADDR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I OPND_SRC_GEN | OPND_SRC_INDIR + +#define OPND_VECTOR_DST_G_I OPND_DST_GEN | OPND_DST_INDIR +#define OPND_VECTOR_DST_G_I_A OPND_DST_GEN | OPND_DST_INDIR | OPND_DST_ADDR +#define OPND_VECTOR_DST_G_I_P OPND_DST_GEN | OPND_DST_PRED | OPND_DST_INDIR + +#define OPND_VECTOR_SRC OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_ADDR | OPND_ADDRESS_OF | OPND_SRC_PRED +#define OPND_VECTOR_DST OPND_DST_GEN | OPND_DST_INDIR | OPND_DST_ADDR | OPND_DST_PRED + +#define OPND_SPECIAL OPND_SAMPLE | OPND_SURFACE + +#define SAME_DATA_TYPE 0x1 +#define SAME_SPECIAL_KIND 0x2 + +#define OPND_BLOCK_WIDTH OPND_IMM +#define OPND_BLOCK_HEIGHT OPND_IMM +#define OPND_PLANE OPND_IMM + +#define OPND_SIMB_INDEX OPND_IMM +#define OPND_NUM_OPNDS OPND_IMM +#define OPND_KIND OPND_IMM + +typedef enum { + SIZE_1 = 1, + SIZE_2 = 2, + SIZE_4 = 4, + SIZE_8 = 8 +} SpecificSize; + +typedef enum { + HORIZON_STRIDE_1 = 1, + HORIZON_VERTICAL_STRIDE_0, + HORIZON_STRIDE_2, + ELEM_NUM_2, + ELEM_NUM_4, + ELEM_NUM_8_16, + ELEM_NUM_96, + ELEM_NUM_128, + ELEM_NUM_224, + ELEM_NUM_GE_2, + ELEM_NUM_GE_16, + ELEM_NUM_GE_32, + ELEM_NUM_GE_128, + ELEM_NUM_GE_160, + ELEM_NUM_MC32, + ELEM_NUM_MC16, + SIZE_54, + SIZE_128, + SIZE_192, + SIZE_224, + SIZE_228, + SIZE_352, + SIZE_SIZE, + OWORD_SIZE, + GE_4, + VALUE_0_3, + VALUE_1_32, + VALUE_1_64, + SINGLE_DATA_TYPE, + PREDICATE_NONEPRED_OPND, + SCALAR_REGION, + LABEL_BLOCK_C, + LABEL_FUNC_C, + SIZE_GE_WIDTH_M_HIEGH, + GE_READSIZE, + GE_WRITESIZE, + SIZE_STREAM_MODE_DEPENDENT_1, + SIZE_STREAM_MODE_DEPENDENT_2, + SIZE_STREAM_MODE_DEPENDENT_3, + SIZE_STREAM_MODE_DEPENDENT_4, + LENGHT_LESS_256, + GRF_ALIGNED = 0x100, + SAT_C = 0x200, + SAT_FLOAT_ONLY = 0x400 + + //GATHER: UPPER_BITS_IGNORE, + // LINENUM: LARGE_THAN_0, + //SIZE_BLOCK_HEIGH_WIDTH, + //OWORD_LD_UNALIGNED: SIZE_SIZE_OWORD, + //Instruction specific features + //RIGHT_ALIGNED, + //MOVS: SINGLE_SPEC_OPND_TYPE, + //FILE NAME: LENGHT_LESS_256, + //ALL: WITHIN_SIMD_WIDTH +} OpndContraint; + +//Common_ISA_Opnd_Desc_Type +enum { + OPND_EXECSIZE = 1, + OPND_STRING, + 
OPND_LABEL, + OPND_ATOMIC_SUBOP, + OPND_EMASK_CTRL, + OPND_COND_MODE, + OPND_CHAN_PATT, + OPND_OWORD_SIZE, + OPND_IS_MODIFIED, + OPND_ELEM_NUM, + OPND_ELEM_SIZE, + OPND_SIMD_MODE, + OPND_CHANNEL_SIMD_MODE, + OPND_CMP_SUBOP, + OPND_VME_SUBOP, + OPND_STREAM_MODE, + OPND_SEARCH_CRTL, + OPND_MATRIX_MODE, + OPND_SUBMATRIX_SHAPE, + OPND_SUBPRE_SHAPE, + OPND_SPECIAL_KIND, + OPND_MEDIA_LD_MODIFIER, + OPND_MEDIA_ST_MODIFIER, + OPND_RAW, + OPND_SUBOPCODE, + OP_EXT +}; + +typedef enum +{ + ISA_Inst_Mov = 0x0, + ISA_Inst_Arith = 0x1, + ISA_Inst_Logic = 0x2, + ISA_Inst_Compare = 0x3, //CMP + ISA_Inst_Address = 0x4, //ADDROF, ADDR_ADD + ISA_Inst_Flow = 0x5, + ISA_Inst_Data_Port = 0x6, + ISA_Inst_Sampler = 0x7, + ISA_Inst_Misc = 0x8, // VME, etc. + ISA_Inst_SIMD_Flow = 0x9, + ISA_Inst_Sync = 0xA, + ISA_Inst_SVM = 0xB, + ISA_Inst_Reserved +} ISA_Inst_Type; + +struct ISA_Inst_Info +{ + ISA_Opcode op; + ISA_Inst_Type type; + const char* str; + uint8_t n_srcs; //for send messages, we count the surface as well as all the offsets to be sources + uint8_t n_dsts; +}; + +#define MAX_OPNDS_PER_INST 24 + +typedef struct OpndDesc +{ + unsigned opnd_type; //Common_ISA_Opnd_Desc_Type OR #defines like OPND_VECTOR_SRC_G_IMM_AO + unsigned data_type; //VISA_Type, overloaded to supported data types if it's a vector + unsigned opnd_constraint; +} OpndDesc; + + +typedef uint8_t ISA_SubOpcode; + +struct ISA_SubInst_Desc +{ + ISA_SubOpcode subOpcode; + ISA_Inst_Type type; + const char* name; + uint16_t opnd_num; + OpndDesc opnd_desc[MAX_OPNDS_PER_INST]; +}; + +struct VISA_INST_Desc +{ + TARGET_PLATFORM platf; + ISA_SubOpcode opcode; + ISA_Inst_Type type; + const char* name; + uint16_t opnd_num; + char attr; + OpndDesc opnd_desc[MAX_OPNDS_PER_INST]; + + const ISA_SubInst_Desc& getSubInstDesc(uint8_t subOpcode) const; + const ISA_SubInst_Desc& getSubInstDescByName(const char *symbol) const; +}; + +enum SVMSubOpcode +{ + SVM_BLOCK_LD = 0x1, + SVM_BLOCK_ST = 0x2, + SVM_GATHER = 0x3, + SVM_SCATTER = 0x4, + SVM_ATOMIC = 0x5, + SVM_GATHER4SCALED, + SVM_SCATTER4SCALED, + SVM_LASTOP +}; + + +extern struct ISA_Inst_Info ISA_Inst_Table[ISA_OPCODE_ENUM_SIZE]; + +extern VISA_INST_Desc CISA_INST_table[ISA_NUM_OPCODE]; diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp new file mode 100644 index 000000000000..4472f78fec38 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp @@ -0,0 +1,188 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// KillAnalysis is an object that can analyze which uses of a value are kills, +// and cache the result. +// +//===----------------------------------------------------------------------===// + +#include "KillAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" + +namespace { + +// BlockInfo : info for one basic block when calculating the live range for +// one value +struct BlockInfo { + llvm::Instruction *LastUser; + bool LiveOut; + BlockInfo() : LastUser(nullptr), LiveOut(false) {} +}; + +} // anonymous namespace + +using namespace llvm; + +/*********************************************************************** + * isKill : determine whether a use is a kill + * + * Enter: U = the use, which must be of an Instruction or Argument + * + * Return: true if this is a kill use (including the case that there are + * multiple uses in the same instruction, and no further reachable + * uses) + * + * This caches the information on which uses of the value are kills. If + * anything changes to do with the value, such as changing uses or moving + * code containing uses, or even completely removing the value, then the + * caller must invalidate the cached information by calling erase(V). + */ +bool KillAnalysis::isKill(Use *U) +{ + SmallVectorImpl *Kills = getKills(*U); + for (unsigned i = 0, e = Kills->size(); i != e; ++i) + if ((*Kills)[i] == U->getUser()) + return true; + return false; +} + +/*********************************************************************** + * getKills : get the kills vector for the value + * + * If there is no kills vector already cached for this value, we need to + * create one by determining its live range and remembering which is the + * last user in each basic block. Where a use is seen in a basic block, + * we recursively add its predecessor blocks to the live range, stopping + * when we get to an already seen block. + * + * This is pretty much the same as the algorithm in + * Appel "Modern Compiler Implementation in C" 19.6. + * + */ +SmallVectorImpl *KillAnalysis::getKills(Value *V) +{ + auto MapIter = Map.find(V); + if (MapIter != Map.end()) + return &MapIter->second; + // Need to construct live range for this value so we can find the kill uses. + std::map Blocks; + // If the value is an instruction, set up the def as the last user in its + // basic block. Don't do anything for an argument. + if (auto Inst = dyn_cast(V)) + Blocks[Inst->getParent()].LastUser = Inst; + // Trace back from each use. + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto Phi = dyn_cast(user)) { + // Use in a phi node. Just mark the incoming block as live out. + Blocks[Phi->getIncomingBlock(ui->getOperandNo())].LiveOut = true; + continue; + } + auto BB = user->getParent(); + auto BI = &Blocks[BB]; + if (BI->LiveOut) + continue; // already live out of this block + if (BI->LastUser == V) { + // This is the first time we have seen a use in this block, and it is + // the def block. It is tentatively the last user in the block, and + // no tracing back is required. 
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h
new file mode 100644
index 000000000000..269b2d45605f
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h
@@ -0,0 +1,51 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+// KillAnalysis is an object that can analyze which uses of a value are kills,
+// and cache the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ValueMap.h"
+
+namespace llvm {
+
+class Use;
+class Value;
+
+class KillAnalysis {
+  ValueMap<Value *, SmallVector<Instruction *, 4>> Map;
+public:
+  // erase : erase a value from the KillAnalysis
+  void erase(Value *V) { Map.erase(V); }
+  // isKill : determine whether a use is a kill
+  bool isKill(Use *U);
+private:
+  SmallVectorImpl<Instruction *> *getKills(Value *V);
+};
+
+} // namespace llvm
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..21f184ee94ac
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(INFO_SOURCES
+  GenXTargetInfo.cpp
+)
+
+add_library(VCTargetInfo ${INFO_SOURCES})
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp
new file mode 100644
index 000000000000..e4b3b53f8cc3
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp
@@ -0,0 +1,50 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXTargetInfo.h" + +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheGenXTarget32() { + static Target TheGenXTarget32; + return TheGenXTarget32; +} + +Target &llvm::getTheGenXTarget64() { + static Target TheGenXTarget64; + return TheGenXTarget64; +} + +extern "C" void LLVMInitializeGenXTargetInfo() { + RegisterTarget<> X(getTheGenXTarget32(), "genx32", "Intel GenX 32-bit", + "genx32"); + RegisterTarget<> Y(getTheGenXTarget64(), "genx64", "Intel GenX 64-bit", + "genx64"); +} + +extern "C" void LLVMInitializeGenXTargetMC() {} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h new file mode 100644 index 000000000000..205dfce2fc1c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h @@ -0,0 +1,39 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef LIB_GENXCODEGEN_TARGETINFO_GENXTARGETINFO_H +#define LIB_GENXCODEGEN_TARGETINFO_GENXTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheGenXTarget32(); +Target &getTheGenXTarget64(); + +} + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt new file mode 100644 index 000000000000..aadf0397f982 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt @@ -0,0 +1,23 @@ + +set(CISA_GEN_INTRINSICS "${CMAKE_CURRENT_SOURCE_DIR}/cisa_gen_intrinsics.py") +set(CISA_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cisa_gen_intrinsics.json") + +set(CISA_OUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/../") +set(CISA_OUT_FILES "${CISA_OUT_PATH}/GenXIntrinsicInfoTable.inc" + "${CISA_OUT_PATH}/GenXIntrinsicsBuildMap.inc") +message(" >>${CISA_OUT_PATH}<< -> ${CMAKE_CURRENT_BINARY_DIR}") +message(" COMMAND -> ${PYTHON_EXECUTABLE} ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} ${CISA_OUT_PATH} <-") +message(" ${CMAKE_CURRENT_SOURCE_DIR}") +add_custom_command( + OUTPUT ${CISA_OUT_FILES} + COMMAND ${PYTHON_EXECUTABLE} ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} ${CISA_OUT_PATH} + COMMENT "Building Cisa generators for GenXCisaBuilder." 
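+  # Note (illustrative, not from the original patch): the generator can also be
+  # invoked by hand as
+  #   python cisa_gen_intrinsics.py cisa_gen_intrinsics.json <output dir>
+  # which writes GenXIntrinsicInfoTable.inc and GenXIntrinsicsBuildMap.inc into
+  # <output dir>; the JSON below maps each genx_* intrinsic to a vISA opcode
+  # ("opc") plus an ordered list of operand descriptors consumed by
+  # GenXCisaBuilder.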
+ DEPENDS ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} + VERBATIM) + +set_source_files_properties( + ${CISA_OUT_FILES} + PROPERTIES GENERATED TRUE + ) + +add_custom_target(GenXUtilBuild ALL DEPENDS ${CISA_OUT_FILES}) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json new file mode 100755 index 000000000000..d77e3e7a2e25 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json @@ -0,0 +1,3674 @@ +{ + "DESCRIPTION": "See cisa_gen_intrinsics.py for description of this document", + "INTRINSICS": { + "genx_fptosi_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_fptoui_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_uutrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_ustrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_sutrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_sstrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_thread_x": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_X, 0, 0, 0, 1, 0)" + }, + "genx_thread_y": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_Y, 0, 0, 0, 1, 0)" + }, + "genx_group_id_x": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_X, 0, 0, 0, 1, 0)" + }, + "genx_group_id_y": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_Y, 0, 0, 0, 1, 0)" + }, + "genx_group_id_z": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_Z, 0, 0, 0, 1, 0)" + }, + "genx_timestamp": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_TSC, 0, 0, 1, 1, 0)" + }, + "genx_r0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": 
"CreateOpndPredefinedSrc(PREDEFINED_R0, 0, 0, 1, 1, 0)" + }, + "genx_ce0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE_NOMASK" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_CE0, 0, 0, 0, 1, 0)" + }, + "genx_sr0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_SR0, 0, 0, 1, 1, 0)" + }, + "genx_set_sr0_2": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE_NOMASK" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_SR0, 0, 2, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_get_color": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_COLOR, 0, 0, 1, 1, 0)" + }, + "genx_get_hwid": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_HW_TID, 0, 0, 0, 1, 0)" + }, + "genx_set_pause": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_TSC, 0, 4, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_dummy_mov": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_NULL, 0, 0, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_constanti": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_constantf": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_media_ld": { + "opc": "ISA_MEDIA_LD", + "modifiers": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "plane": [ "BYTE", 3 ], + "block_width": [ "BYTE", 4 ], + "block_height": [ "MEDIAHEIGHT", 4 ], + "x_offset": [ "GENERAL", "UNSIGNED", 5 ], + "y_offset": [ "GENERAL", "UNSIGNED", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_media_st": { + "opc": "ISA_MEDIA_ST", + "modifiers": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "plane": [ "BYTE", 3 ], + "block_width": [ "BYTE", 4 ], + "block_height": [ "MEDIAHEIGHT", 4 ], + "x_offset": [ "GENERAL", "UNSIGNED", 5 ], + "y_offset": [ "GENERAL", "UNSIGNED", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_oword_ld": { + "opc": "ISA_OWORD_LD", + "log2_owords": [ "LOG2OWORDS", 0 ], + "is_modified": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "GENERAL", "UNSIGNED", 3 ], + "dst": [ "RAW", 0 ] + }, + "genx_oword_ld_unaligned": { + "opc": "ISA_OWORD_LD_UNALIGNED", + "gen_opc": "ISA_OWORD_LD", + "log2_owords": [ "LOG2OWORDS", 0 ], + "is_modified": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "GENERAL", "UNSIGNED", 3 ], + "dst": [ "RAW", 0 ] + }, + "genx_oword_st": { + "opc": "ISA_OWORD_ST", + "log2_owords": [ "LOG2OWORDS", 3 ], + "surface": [ "SURFACE", 1 ], + "offset": [ "GENERAL", "UNSIGNED", 2 ], + "src": [ "RAW", 3 ] + }, + "genx_dword_atomic_add": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_sub": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + 
"exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_inc": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_dec": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_min": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_max": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_xchg": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_cmpxchg": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "URAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_and": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_or": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_xor": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_imin": { + "opc": 
"ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "SRAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "SRAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_imax": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "SRAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "SRAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fmax": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fmin": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fcmpwr": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FCMPWR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "RAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "fma": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "MODIFIER_ARITH", 3 ] + }, + "genx_ssmad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_sumad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_usmad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_uumad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_ssmad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ 
"GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_sumad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_usmad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_uumad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_constantpred": { + "opc": "ISA_SETP", + "exec_size": [ "EXECSIZE" ], + "dst": [ "PREDICATE", 0 ], + "src0": [ "CONSTVI1ASI32", 1 ] + }, + "genx_smax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fmax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ] + }, + "genx_smin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fmin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ] + }, + "genx_pow": { + "opc": "ISA_POW", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_add_addr": { + "opc": "ISA_ADDR_ADD", + "exec_size": [ "EXECSIZE" ], + "dst": [ "ADDRESS", 0 ], + "src0": [ "ADDRESS", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_3d_sample": { + "opc": "ISA_3D_SAMPLE", + "sampling3d_opcode": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 3 ], + "aoffimmi_value": [ "GENERAL", "UNSIGNED", 4 ], + "sampler": [ "SAMPLER", 5 ], + "surface": [ "SURFACE", 6 ], + "dst": [ "RAW", 0 ], + "number_of_additional_operands": [ "ARGCOUNT", 
"ARGCOUNTMIN1", 7 ], + "raw_operands": [ "RAW_OPERANDS", "RAW", 7 ] + }, + "genx_sqrt": { + "opc": "ISA_SQRT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rsqrt": { + "opc": "ISA_RSQRT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ieee_sqrt": { + "opc": "ISA_SQRTM", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_inv": { + "opc": "ISA_INV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_log": { + "opc": "ISA_LOG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_exp": { + "opc": "ISA_EXP", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_scatter_scaled": { + "opc": "ISA_SCATTER_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "1_byte_block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_scatter4_scaled": { + "opc": "ISA_SCATTER4_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_scatter4_typed": { + "opc": "ISA_SCATTER4_TYPED", + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 1 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "URAW", 4 ], + "V_pixel_address": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R_pixel_address": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "NULLRAW" ], + "src": [ "RAW", 7 ] + }, + "genx_gather_scaled": { + "opc": "ISA_GATHER_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather_scaled2": { + "opc": "ISA_GATHER_SCALED", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 1 ], + "scale": [ "SHORT", 2 ], + "surface": [ "SURFACE", 3 ], + "global_offset": [ "GENERAL", "UNSIGNED", 4 ], + "element_offset": [ "URAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_scaled": { + "opc": "ISA_GATHER4_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_scaled2": { + "opc": "ISA_GATHER4_SCALED", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "channel_mask": [ "BYTE", 1 ], + "scale": [ "SHORT", 2 ], + "surface": [ "SURFACE", 3 ], + "global_offset": [ "GENERAL", "UNSIGNED", 4 ], + "element_offset": [ 
"URAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_typed": { + "opc": "ISA_GATHER4_TYPED", + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 1 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "URAW", 4 ], + "V_pixel_address": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R_pixel_address": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_typed_atomic_add": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_sub": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_inc": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 3 ], + "V": [ "URAW", "RAW_NULLALLOWED", 4 ], + "R": [ "URAW", "RAW_NULLALLOWED", 5 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 6 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_dec": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 3 ], + "V": [ "URAW", "RAW_NULLALLOWED", 4 ], + "R": [ "URAW", "RAW_NULLALLOWED", 5 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 6 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_min": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_max": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_xchg": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ 
"URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_cmpxchg": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 5 ], + "V": [ "URAW", "RAW_NULLALLOWED", 6 ], + "R": [ "URAW", "RAW_NULLALLOWED", 7 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 8 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_and": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_or": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_xor": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_imin": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_imax": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fmax": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fmin": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ 
"URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fcmpwr": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FCMPWR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 5 ], + "V": [ "URAW", "RAW_NULLALLOWED", 6 ], + "R": [ "URAW", "RAW_NULLALLOWED", 7 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 8 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_sssad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_uusad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_susad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_ussad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_sssad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_uusad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_susad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_ussad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_ssad2": { + "opc": "ISA_SAD2", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ] + }, + "genx_usad2": { + "opc": "ISA_SAD2", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_wait": { + "opc": "ISA_WAIT", + "mask": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_avs": { + "opc": "ISA_AVS", + "channel_mask": [ "BYTE", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 
3 ], + "U_pixel_address": [ "GENERAL", 4 ], + "V_pixel_address": [ "GENERAL", 5 ], + "deltaU": [ "GENERAL", 6 ], + "deltaV": [ "GENERAL", 7 ], + "u2d": [ "GENERAL", 8 ], + "groupID": [ "GENERAL", "UNSIGNED", 9 ], + "verticalBlockNumber": [ "GENERAL", "UNSIGNED", 10 ], + "output_format_control": [ "BYTE", 11 ], + "v2d": [ "GENERAL", 12 ], + "execMode": [ "BYTE", 13 ], + "IEFByPass": [ "GENERAL", "UNSIGNED", 14 ], + "dst": [ "RAW", 0 ] + }, + "genx_sample_unorm": { + "opc": "ISA_SAMPLE_UNORM", + "channel_mask": [ "BYTE", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "GENERAL", 4 ], + "V_pixel_address": [ "GENERAL", 5 ], + "deltaU": [ "GENERAL", 6 ], + "deltaV": [ "GENERAL", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_sin": { + "opc": "ISA_SIN", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_cos": { + "opc": "ISA_COS", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ssavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_suavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_suavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fence": { + "opc": "ISA_FENCE", + "mask": [ "BYTE", 1 ] + }, + "genx_ssadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", 
"MODIFIER_ARITH", 2 ] + }, + "genx_suadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_lzd": { + "opc": "ISA_LZD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_raw_send": { + "opc": "ISA_RAW_SEND", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "extended_message_descriptor": [ "INT", 3 ], + "numsrc": [ "NUMGRFS", 5 ], + "numdst": [ "NUMGRFS", 0 ], + "desc": [ "GENERAL", "UNSIGNED", 4 ], + "src": [ "RAW", 5 ], + "skip__": [ "TWOADDR", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_send_noresult": { + "opc": "ISA_RAW_SEND", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "extended_message_descriptor": [ "INT", 3 ], + "numsrc": [ "NUMGRFS", 5 ], + "numdst": [ "LITERAL", 0 ], + "desc": [ "GENERAL", "UNSIGNED", 4 ], + "src": [ "RAW", 5 ], + "dst": [ "NULLRAW" ] + }, + "genx_raw_sends": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "numsrc": [ "NUMGRFS", 6 ], + "numsrc2": [ "NUMGRFS", 7 ], + "numdst": [ "NUMGRFS", 0 ], + "FFID": [ "BYTE", 3 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 4 ], + "desc": [ "GENERAL", "UNSIGNED", 5 ], + "src": [ "RAW", 6 ], + "src2": [ "RAW", 7 ], + "skip__": [ "TWOADDR", 8 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_sends_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "numsrc": [ "NUMGRFS", 6 ], + "numsrc2": [ "NUMGRFS", 7 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 3 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 4 ], + "desc": [ "GENERAL", "UNSIGNED", 5 ], + "src": [ "RAW", 6 ], + "src2": [ "RAW", 7 ], + "dst": [ "NULLRAW", 8 ] + }, + "genx_raw_send2": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "LITERAL", 0 ], + "numdst": [ "BYTE", 5 ], + "FFID": [ "BYTE", 6 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 7 ], + "desc": [ "GENERAL", "UNSIGNED", 8 ], + "src": [ "RAW", 9 ], + "src2": [ "NULLRAW", 10 ], + "skip__": [ "TWOADDR", 10 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_send2_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "LITERAL", 0 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 5 ], + "extended_message_descriptor": [ 
"GENERAL", "UNSIGNED", 6 ], + "desc": [ "GENERAL", "UNSIGNED", 7 ], + "src": [ "RAW", 8 ], + "src2": [ "NULLRAW", 9 ], + "dst": [ "NULLRAW", 10 ] + }, + "genx_raw_sends2": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "BYTE", 5 ], + "numdst": [ "BYTE", 6 ], + "FFID": [ "BYTE", 7 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 8 ], + "desc": [ "GENERAL", "UNSIGNED", 9 ], + "src": [ "RAW", 10 ], + "src2": [ "RAW", 11 ], + "skip__": [ "TWOADDR", 12 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_sends2_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "BYTE", 5 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 6 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 7 ], + "desc": [ "GENERAL", "UNSIGNED", 8 ], + "src": [ "RAW", 9 ], + "src2": [ "RAW", 10 ], + "dst": [ "NULLRAW", 11 ] + }, + "genx_rndd": { + "opc": "ISA_RNDD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rnde": { + "opc": "ISA_RNDE", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rndu": { + "opc": "ISA_RNDU", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rndz": { + "opc": "ISA_RNDZ", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ssmul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sumul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usmul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uumul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssmul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sumul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usmul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", 
"MODIFIER_ARITH", 2 ] + }, + "genx_uumul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_smulh": { + "opc": "ISA_MULH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umulh": { + "opc": "ISA_MULH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssshl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sushl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usshl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uushl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssshl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sushl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usshl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uushl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_rol": { + "opc": "ISA_ROL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_ror": { + "opc": "ISA_ROR", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_sbfe": { + "opc": "ISA_BFE", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", 
"SIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "SIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "SIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "SIGNED", 3 ] + }, + "genx_ubfe": { + "opc": "ISA_BFE", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "UNSIGNED", 3 ] + }, + "genx_bfi": { + "opc": "ISA_BFI", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "UNSIGNED", 3 ], + "src3": [ "GENERAL", "OWALIGNED", "UNSIGNED", 4 ] + }, + "genx_va_minmax": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_MINMAX_FOPCODE", + "sub_opc": [ "LITERAL", "MINMAX_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "Min_Max_Enable": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_minmax_filter": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_MINMAXFILTER_FOPCODE", + "sub_opc": [ "LITERAL", "MINMAXFILTER_FOPCODE" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "output_size": [ "BYTE", 5 ], + "return_data_format": [ "BYTE", 6 ], + "Min_Max_Enable": [ "GENERAL", 7 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_centroid": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_Centroid_FOPCODE", + "sub_opc": [ "LITERAL", "Centroid_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "vSize": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_bool_centroid": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_BoolCentroid_FOPCODE", + "sub_opc": [ "LITERAL", "BoolCentroid_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "vSize": [ "GENERAL", 4 ], + "hSize": [ "GENERAL", 5 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_hdc_1pixel_convolve": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_1PIXELCONV", + "sub_opc": [ "LITERAL", "ISA_HDC_1PIXELCONV" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "pixel_size": [ "BYTE", 5 ], + "offsets": [ "RAW", 6 ], + "destination_surface": [ "SURFACE", 7 ], + "destination_x_offset": [ "GENERAL", 8 ], + "destination_y_offset": [ "GENERAL", 9 ] + }, + "genx_va_hdc_convolve2d": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_CONV", + "sub_opc": [ "LITERAL", "ISA_HDC_CONV" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "destination_surface": [ "SURFACE", 6 ], + "destination_x_offset": [ "GENERAL", 7 ], + "destination_y_offset": [ "GENERAL", 8 ] + }, + "genx_va_hdc_lbp_correlation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_LBPCORRELATION", + "sub_opc": [ "LITERAL", "ISA_HDC_LBPCORRELATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + 
"normalized_y_co_ordinate": [ "GENERAL", 3 ], + "horizontal_disparity": [ "GENERAL", 4 ], + "destination_surface": [ "SURFACE", 5 ], + "destination_x_offset": [ "GENERAL", 6 ], + "destination_y_offset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_lbp_creation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_LBPCREATION", + "sub_opc": [ "LITERAL", "ISA_HDC_LBPCREATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "mode": [ "BYTE", 4 ], + "destination_surface": [ "SURFACE", 5 ], + "destination_x_offset": [ "GENERAL", 6 ], + "destination_y_offset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_minmax_filter": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_MMF", + "sub_opc": [ "LITERAL", "ISA_HDC_MMF" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "return_data_format": [ "BYTE", 5 ], + "minmax_enable_mode": [ "BYTE", 6 ], + "destination_surface": [ "SURFACE", 7 ], + "destination_x_offset": [ "GENERAL", 8 ], + "destination_y_offset": [ "GENERAL", 9 ] + }, + "genx_va_correlation_search": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_CORRELATION_SEARCH", + "sub_opc": [ "LITERAL", "VA_OP_CODE_CORRELATION_SEARCH" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "normalized_vertical_origin": [ "GENERAL", 4 ], + "normalized_horizontal_origin": [ "GENERAL", 5 ], + "x_direction_size": [ "GENERAL", 6 ], + "y_direction_size": [ "GENERAL", 7 ], + "x_direction_search_size": [ "GENERAL", 8 ], + "y_direction_search_size": [ "GENERAL", 9 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_flood_fill": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_FLOOD_FILL", + "sub_opc": [ "LITERAL", "VA_OP_CODE_FLOOD_FILL" ], + "Is8Connect": [ "BYTE", 1 ], + "pixel_mask_horizontal_direction": [ "RAW", 2 ], + "pixel_mask_vertical_direction_left": [ "GENERAL", 3 ], + "pixel_mask_vertical_direction_right": [ "GENERAL", 4 ], + "loop_count": [ "GENERAL", 5 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_lbp_correlation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CORRELATION", + "sub_opc": [ "LITERAL", "VA_OP_CODE_LBP_CORRELATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "horizontal_disparity": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_lbp_creation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CREATION", + "sub_opc": [ "LITERAL", "VA_OP_CODE_LBP_CREATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "mode": [ "BYTE", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_3d_load": { + "opc": "ISA_3D_LOAD", + "sampling3d_opcode": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 3 ], + "aoffimmi_value": [ "GENERAL", "UNSIGNED", 4 ], + "surface": [ "SURFACE", 5 ], + "dst": [ "RAW", 0 ], + "number_of_additional_operands": [ "ARGCOUNT", "ARGCOUNTMIN1", 6 ], + "raw_operands": [ "RAW_OPERANDS", "RAW", 6 ] + }, + "genx_frc": { + "opc": "ISA_FRC", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SATURATION_NOSAT", 0 ], + "src0": [ 
"GENERAL", 1 ] + }, + "genx_va_convolve2d": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_Convolve_FOPCODE", + "sub_opc": "Convolve_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_erode": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_ERODE_FOPCODE", + "sub_opc": "ERODE_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_dilate": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_ERODE_FOPCODE", + "sub_opc": "Dilate_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_hdc_erode": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_ERODE", + "sub_opc": "ISA_HDC_ERODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "dstSurface": [ "SURFACE", 5 ], + "xOffset": [ "GENERAL", 6 ], + "yOffset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_dilate": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_DILATE", + "sub_opc": "ISA_HDC_DILATE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "dstSurface": [ "SURFACE", 5 ], + "xOffset": [ "GENERAL", 6 ], + "yOffset": [ "GENERAL", 7 ] + }, + "genx_barrier": { + "opc": "ISA_BARRIER", + "nobarrier": [ "ISBARRIER" ] + }, + "genx_yield": { + "opc": "ISA_YIELD" + }, + "genx_cache_flush": { + "opc": "ISA_SAMPLR_CACHE_FLUSH" + }, + "genx_sbarrier": { + "opc": "ISA_SBARRIER", + "signal_flag": [ "BYTE", 1 ] + }, + "genx_bfrev": { + "opc": "ISA_BFREV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_cbit": { + "opc": "ISA_CBIT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_ieee_div": { + "opc": "ISA_DIVM", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_dp2": { + "opc": "ISA_DP2", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dp3": { + "opc": "ISA_DP3", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dp4": { + "opc": "ISA_DP4", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dph": { + "opc": "ISA_DPH", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ 
"GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_sfbh": { + "opc": "ISA_FBH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ] + }, + "genx_ufbh": { + "opc": "ISA_FBH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_fbl": { + "opc": "ISA_FBL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_line": { + "opc": "ISA_LINE", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "FIXED4", "NOIMM", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_load": { + "opc": "ISA_LOAD", + "channel_mask": [ "SAMPLECHMASK", 1 ], + "surface": [ "SURFACE", 2 ], + "U_pixel_address": [ "RAW", 3 ], + "V_pixel_address": [ "RAW", 4 ], + "R_pixel_address": [ "RAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_lrp": { + "opc": "ISA_LRP", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", "CONTIGUOUS", 0 ], + "src0": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 1 + ], + "src1": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 2 + ], + "src2": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 3 + ] + }, + "genx_pln": { + "opc": "ISA_PLANE", + "exec_size": [ "EXECSIZE_GE8" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "FIXED4", "NOIMM", 1 ], + "src1": [ "GENERAL", "GRFALIGNED", "TWICEWIDTH", "NOIMM", 2 ] + }, + "genx_sample": { + "opc": "ISA_SAMPLE", + "channel_mask": [ "SAMPLECHMASK", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "RAW", 4 ], + "V_pixel_address": [ "RAW", 5 ], + "R_pixel_address": [ "RAW", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_atomic_add": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_sub": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_min": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_max": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + 
"genx_svm_atomic_xchg": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_and": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_or": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_xor": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_imin": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "address": [ "URAW", 2 ], + "src0": [ "SRAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "SRAW", 0 ] + }, + "genx_svm_atomic_imax": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "address": [ "URAW", 2 ], + "src0": [ "SRAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "SRAW", 0 ] + }, + "genx_svm_atomic_inc": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "address": [ "URAW", 2 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 3 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_dec": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "address": [ "URAW", 2 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 3 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_cmpxchg": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "skip__": [ "TWOADDR", 5 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_block_ld": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_LD", + "sub_opc": [ "LITERAL", "SVM_BLOCK_LD" ], + "log2_owords": [ "LOG2OWORDS", 0 ], + "address": [ 
"GENERAL", "UNSIGNED", 1 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_block_ld_unaligned": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_LD", + "sub_opc": [ "LITERAL", "SVM_BLOCK_LD" ], + "log2_owords": [ "LOG2OWORDS_PLUS_8", 0 ], + "address": [ "GENERAL", "UNSIGNED", 1 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_block_st": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_ST", + "sub_opc": [ "LITERAL", "SVM_BLOCK_ST" ], + "log2_owords": [ "LOG2OWORDS", 2 ], + "address": [ "GENERAL", "UNSIGNED", 1 ], + "src": [ "RAW", 2 ] + }, + "genx_svm_gather": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_GATHER", + "sub_opc": [ "LITERAL", "SVM_GATHER" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_inferred_from_dst": [ "SVMGATHERBLOCKSIZE", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "address": [ "URAW", 3 ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_gather4_scaled": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_GATHER4SCALED", + "sub_opc": [ "LITERAL", "SVM_GATHER4SCALED" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "address": [ "GENERAL", "UNSIGNED", 4 ], + "offset": [ "URAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_scatter": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_SCATTER", + "sub_opc": [ "LITERAL", "SVM_SCATTER" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_inferred_from_dst": [ "SVMGATHERBLOCKSIZE", 4 ], + "log2_num_blocks": [ "BYTE", 2 ], + "address": [ "URAW", 3 ], + "src": [ "RAW", 4 ] + }, + "genx_svm_scatter4_scaled": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_SCATTER4SCALED", + "sub_opc": [ "LITERAL", "SVM_SCATTER4SCALED" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "address": [ "GENERAL", "UNSIGNED", 4 ], + "offset": [ "URAW", 5 ], + "src": [ "RAW", 6 ] + }, + "genx_predefined_surface": { + "opc": "ISA_MOVS", + "exec_size": [ "EXECSIZE" ], + "dst": [ "SURFACE", 0 ], + "src0": [ "INT", 1 ], + "OPTIONS": [ "disable" ] + }, + "genx_va_1pixel_convolve": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE", + "sub_opc": "VA_OP_CODE_1PIXEL_CONVOLVE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "offsets": [ "RAW", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1pixel_convolve_1x1mode": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE", + "sub_opc": "VA_OP_CODE_1PIXEL_CONVOLVE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "LITERAL", 3 ], + "offsets": [ "NULLRAW" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1d_convolve_vertical": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH", + "sub_opc": "VA_OP_CODE_1D_CONVOLVE_VERTICAL", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_V_DIRECTION" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1d_convolve_horizontal": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH", + "sub_opc": "VA_OP_CODE_1D_CONVOLVE_HORIZONTAL", + "sampler": [ 
"SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_H_DIRECTION" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_hdc_1d_convolve_horizontal": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH", + "sub_opc": [ "LITERAL", "ISA_HDC_1DCONV_H" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_H_DIRECTION" ], + "dstSurface": [ "SURFACE", 6 ], + "xOffset": [ "GENERAL", 7 ], + "yOffset": [ "GENERAL", 8 ] + }, + "genx_va_hdc_1d_convolve_vertical": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH", + "sub_opc": [ "LITERAL", "ISA_HDC_1DCONV_H" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_V_DIRECTION" ], + "dstSurface": [ "SURFACE", 6 ], + "xOffset": [ "GENERAL", 7 ], + "yOffset": [ "GENERAL", 8 ] + }, + "genx_simdcf_get_em":{ + "opc": "ISA_CMP_E", + "exec_size": [ "EXECSIZE_FROM_ARG", 1], + "src0": "CreateImmOpndFromUInt(ISA_TYPE_UB, 1)", + "src1": "CreateImmOpndFromUInt(ISA_TYPE_UB, 1)", + "dst" : [ "Z_PREDICATE", 0] + } + }, + + "OPCODE_GEN": { + "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve1D", + "sampler", + "surface", + "uOffset", + "vOffset", + "(HDCReturnFormat)properties", + "direction", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve1D", + "sampler", + "surface", + "uOffset", + "vOffset", + "(CONVExecMode)mode", + "direction", + "dst" + ] + ] + ], + "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve1Pixel", + "sampler", + "surface", + "uOffset", + "vOffset", + "(CONV1PixelExecMode)mode", + "offsets", + "dst" + ] + ] + ], + "ISA_MOV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_MOVS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "nullptr", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "(VISA_VectorOpnd*)dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_MEDIA_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessMediaLoadStoreInst", + "opc", + "(MEDIA_LD_mod)modifiers", + "surface", + "block_width", + "block_height", + "x_offset", + "y_offset", + "dst", + "(CISA_PLANE_ID)plane" + ] + ] + ], + "ISA_MEDIA_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessMediaLoadStoreInst", + "opc", + "(MEDIA_LD_mod)modifiers", + "surface", + "block_width", + "block_height", + "x_offset", + "y_offset", + "src", + "(CISA_PLANE_ID)plane" + ] + ] + ], + "ISA_OWORD_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessOwordLoadStoreInst", + "opc", + "vISA_EMASK_M1", + "surface", + "log2_owords", + "offset", + "dst" + ] + ] + ], + "ISA_OWORD_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessOwordLoadStoreInst", + "opc", + "vISA_EMASK_M1", + "surface", + "log2_owords", + "offset", + "src" + ] + ] + ], + "ISA_DWORD_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessDwordAtomicInst", + "pred", + "sub_opc", + "false", + "exec_mask", + "exec_size", + "surface", + "offset", + 
"src", + "src1", + "dst" + ] + ] + ], + "ISA_MAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_SETP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASetP", + "exec_mask", + "exec_size", + "dst", + "src0" + ] + ] + ], + "ISA_FMINMAX": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMinMaxInst", + "(CISA_MIN_MAX_SUB_OPCODE)flag_for_max", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ], + "ISA_POW": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_ADDR_ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAAddrAddInst", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ], + "ISA_3D_SAMPLE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dSampler", + "(VISASampler3DSubOpCode)sampling3d_opcode", + "false", + "false", + "false", + "pred", + "exec_mask", + "exec_size", + "static_cast(channel_mask)", + "aoffimmi_value", + "sampler", + "surface", + "dst", + "number_of_additional_operands", + "raw_operands" + ] + ] + ], + "ISA_SQRT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RSQRT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_SQRTM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_INV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_LOG": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_EXP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_SCATTER_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessScatterScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Num)log2_num_blocks", + "surface", + "global_offset", + "element_offset", + "src" + ] + ] + ], + "ISA_SCATTER4_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4ScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "surface", + "global_offset", + "element_offset", + "src" + ] + ] + ], + "ISA_SCATTER4_TYPED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4TypedInst", + "opc", + "pred", + "convertChannelMaskToVisaType(channel_mask)", + "exec_mask", + "exec_size", + "surface", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "LOD", + "src" + ] + ] + ], + "ISA_GATHER_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessScatterScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Num)log2_num_blocks", + "surface", + "global_offset", + "element_offset", + 
"dst" + ] + ] + ], + "ISA_GATHER4_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4ScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "surface", + "global_offset", + "element_offset", + "dst" + ] + ] + ], + "ISA_GATHER4_TYPED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4TypedInst", + "opc", + "pred", + "convertChannelMaskToVisaType(channel_mask)", + "exec_mask", + "exec_size", + "surface", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "LOD", + "dst" + ] + ] + ], + "ISA_3D_TYPED_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dTypedAtomic", + "sub_opc", + "sub_opc & (1<<5)", + "pred", + "exec_mask", + "exec_size", + "surface", + "U", + "V", + "R", + "LOD", + "src0", + "src1", + "dst" + ] + ] + ], + "ISA_SAD2ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_SAD2": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_WAIT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAWaitInst", + "mask" + ] + ] + ], + "ISA_AVS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMEAVS", + "surface", + "sampler", + "(VISAChannelMask)channel_mask", + "U_pixel_address", + "V_pixel_address", + "deltaU", + "deltaV", + "u2d", + "v2d", + "groupID", + "verticalBlockNumber", + "(OutputFormatControl)output_format_control", + "(AVSExecMode)execMode", + "IEFByPass", + "dst" + ] + ] + ], + "ISA_SAMPLE_UNORM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASISampleUnorm", + "surface", + "sampler", + "(VISAChannelMask)channel_mask", + "U_pixel_address", + "V_pixel_address", + "deltaU", + "deltaV", + "dst", + "getChannelOutputFormat(channel_mask)" + ] + ] + ], + "ISA_SIN": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_COS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_AVG": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_FENCE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "mask" + ] + ] + ], + "ISA_ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_LZD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RAW_SEND": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMiscRawSend", + "pred", + "exec_mask", + "exec_size", + "modifier_sendc", + "extended_message_descriptor", + "numsrc", + "numdst", + "desc", + "src", + "dst" + ] + ] + ], + "ISA_RAW_SENDS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMiscRawSends", + "pred", + "exec_mask", + "exec_size", + "modifier_sendc", + "FFID", + "extended_message_descriptor", + "numsrc", + "numsrc2", + "numdst", + "desc", + "src", + "src2", + "dst", + "false" 
+ ] + ] + ], + "ISA_RNDD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDU": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDZ": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_MUL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_MULH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_SHL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_ROL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_ROR": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_BFE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2", + "nullptr" + ] + ] + ], + "ISA_BFI": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2", + "src3" + ] + ] + ], + "ISA_VA_MINMAX_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAMinMax", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "Min_Max_Enable", + "Destination" + ] + ] + ], + "ISA_VA_MINMAXFILTER_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAMinMaxFilter", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(OutputFormatControl)output_size", + "(MMFExecMode)return_data_format", + "Min_Max_Enable", + "Destination" + ] + ] + ], + "ISA_VA_Centroid_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVACentroid", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "vSize", + "Destination" + ] + ] + ], + "ISA_VA_BoolCentroid_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVABooleanCentroid", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "vSize", + "hSize", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_1PIXELCONV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve1Pixel", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)pixel_size", + "offsets", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + 
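+ // Each OPCODE_GEN entry maps a (possibly compound) opcode to the vISA kernel builder call that + // emits the instruction; the names inside the call refer to the variables produced by the argument + // generators of the intrinsic being translated (see ARGUMENTS_GEN below and the description in + // cisa_gen_intrinsics.py). Lines containing "//" are stripped by that script before JSON parsing.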
"ISA_VA_SKL_PLUS_ISA_HDC_CONV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)(properties & 0xf)", + "(CONVHDCRegionSize)(properties >> 4)", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_LBPCORRELATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCLBPCorrelation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "horizontal_disparity", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_LBPCREATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCLBPCreation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(LBPCreationMode)mode", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_MMF": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCMinMaxFilter", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)return_data_format", + "(MMFEnableMode)minmax_enable_mode", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_CORRELATION_SEARCH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVACorrelationSearch", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "normalized_vertical_origin", + "normalized_horizontal_origin", + "x_direction_size", + "y_direction_size", + "x_direction_search_size", + "y_direction_search_size", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_FLOOD_FILL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAFloodFill", + "Is8Connect", + "pixel_mask_horizontal_direction", + "pixel_mask_vertical_direction_left", + "pixel_mask_vertical_direction_right", + "loop_count", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CORRELATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVALBPCorrelation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "horizontal_disparity", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CREATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVALBPCreation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(LBPCreationMode)mode", + "Destination" + ] + ] + ], + "ISA_3D_LOAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dLoad", + "(VISASampler3DSubOpCode)sampling3d_opcode", + "false", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "aoffimmi_value", + "surface", + "dst", + "number_of_additional_operands", + "raw_operands" + ] + ] + ], + "ISA_FRC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_VA_Convolve_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(CONVExecMode)(properties & 0x3)", + "((properties >> 4) & 0x1)", + "dst" + ] + ] + ], + "ISA_VA_ERODE_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAErodeDilate", + "VA_DILATE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(EDExecMode)properties", + "dst" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_ERODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCErodeDilate", + 
"VA_ERODE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_DILATE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCErodeDilate", + "VA_DILATE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_BARRIER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_YIELD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_SAMPLR_CACHE_FLUSH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_SBARRIER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASplitBarrierInst", + "signal_flag != 0" + ] + ] + ], + "ISA_BF_CVT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "nullptr", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_BFREV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_CBIT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_DIVM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP2": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP3": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP4": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP4A": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_DPH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_FBH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_FBL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_LINE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_LOAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASILoad", + "surface", + "convertChannelMaskToVisaType(channel_mask & 0xf)", + "(channel_mask >> 4) & 0x3", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "dst" + ] + ] + ], + "ISA_SAMPLE": [ + [ 
"CISA_CALL", + [ "Kernel->AppendVISASISample", + "vISA_EMASK_M1", + "surface", + "sampler", + "convertChannelMaskToVisaType(channel_mask)", + "(channel_mask >> 4) & 0x3", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "dst" + ] + ] + ], + "ISA_LRP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_PLANE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_SVM_SVM_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmAtomicInst", + "pred", + "exec_mask", + "exec_size", + "sub_opc", + "32", + "address", + "src0", + "src1", + "dst" + ] + ] + ], + "ISA_SVM_SVM_BLOCK_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmBlockLoadInst", + "VISA_Oword_Num(log2_owords & 0x7)", + "(log2_owords & 8)", + "address", + "dst" + ] + ] + ], + "ISA_SVM_SVM_BLOCK_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmBlockStoreInst", + "VISA_Oword_Num(log2_owords)", + "(log2_owords & 8)", + "address", + "src" + ] + ] + ], + "ISA_SVM_SVM_GATHER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmGatherInst", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Type)block_size_inferred_from_dst", + "(VISA_SVM_Block_Num)log2_num_blocks", + "address", + "dst" + ] + ] + ], + "ISA_SVM_SVM_GATHER4SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmGather4ScaledInst", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "address", + "offset", + "dst" + ] + ] + ], + "ISA_SVM_SVM_SCATTER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmScatterInst", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Type)block_size_inferred_from_dst", + "(VISA_SVM_Block_Num)log2_num_blocks", + "address", + "src" + ] + ] + ], + "ISA_SVM_SVM_SCATTER4SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmScatter4ScaledInst", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "address", + "offset", + "src" + ] + ] + ], + "ISA_CMP_E": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAComparisonInst", + "opc", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ] + }, + "ARGUMENTS_GEN": { + "EXECSIZE": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE2": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE4": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE8": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_NOT2": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_NOMASK": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_FROM_ARG": "GetExecSizeFromArg(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_FROM_BYTE": "GetExecSizeFromByte(II::ArgInfo({args}), &exec_mask)", + "NULLRAW": "CreateNullRawOperand(II::ArgInfo({args}))", + "MEDIAHEIGHT": "GetMediaHeght(II::ArgInfo({args}))", + "IMPLICITPRED": "CreateImplicitPredication(II::ArgInfo({args}))", + "GENERAL": "CreateOperand(II::ArgInfo({args}))", + "ADDRESS": "CreateAddressOperand(II::ArgInfo({args}))", + "RAW": "CreateRawOperand(II::ArgInfo({args}))", + "URAW": "CreateRawOperand(II::ArgInfo({args} | II::RAW_UNSIGNED))", + "SRAW": "CreateRawOperand(II::ArgInfo({args} | II::RAW_SIGNED))", + "SURFACE": "CreateSurfaceOperand(II::ArgInfo({args}))", + "SAMPLER": "CreateSamplerOperand(II::ArgInfo({args}))", + "PREDICATION": 
"CreatePredication(II::ArgInfo({args}))", + "PREDICATE": "GetPredicateVar(II::ArgInfo({args}))", + "Z_PREDICATE": "GetZeroedPredicateVar(II::ArgInfo({args}))", + "BYTE": "GetUnsignedValue(II::ArgInfo({args}))", + "SHORT": "GetUnsignedValue(II::ArgInfo({args}))", + "INT": "GetUnsignedValue(II::ArgInfo({args}))", + "LOG2OWORDS": "GetOwords(II::ArgInfo({args}))", + "LOG2OWORDS_PLUS_8": "GetOwords(II::ArgInfo({args})) + 8", + "SVMGATHERBLOCKSIZE": "GetSvmGatherBlockSize(II::ArgInfo({args}))", + "TWOADDR": "ProcessTwoAddr(II::ArgInfo({args}))", + "CONSTVI1ASI32": "ConstVi1Asi32(II::ArgInfo({args}))", + "ARGCOUNT": "GetArgCount(II::ArgInfo({args}))", + "NUMGRFS": "GetNumGrfs(II::ArgInfo({args}))", + "SAMPLECHMASK": "GetSampleChMask(II::ArgInfo({args}))", + "RAW_OPERANDS": ["VISA_RawOpnd* {dst}[16]", "CreateRawOperands(II::ArgInfo({args}), {dst})"], + "LITERAL": "{value1}", + "ISBARRIER": "HasBarrier = true", + "SKIP": null + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py new file mode 100755 index 000000000000..2e099df34b8c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py @@ -0,0 +1,230 @@ +#!/usr/bin/python3 +""" +Usage: cisa_gen_intrinsics.py + +This script gets intrinsics description from JSON file specified by argument +and generates two files GenXIntrinsicInfoTable.inc and GenXIntrinsicsBuildMap.inc into +path specified by argument. + +JSON file must contain following mandatory fields: INTRINSICS, OPCODE_GEN and ARGUMENTS_GEN. + +*** Field INTRINSICS + Contains description of all intrinsics. Each intrinsic is described in following format: + intrinsic_name : { + opc: VISA opcode corresponding to the intrinsic + gen_opc: optional field, it aims to distinguish generators of complex opcodes which may + contain sub-opcode field + OPTIONS: list of intrinsics options. Currently, supported only 'disable' value, which means + that intrinsic will be skipped at all. + : see description below + } + + Each argument is a [key: list] format, where key is a name of Argument, list is a command + for generator. + First field of generator command is a generator name, it tells how to generate code for + fetching an argument value. Each argument generator is described in ARGUMENTS_GEN map. + + For example: + "Surface": ["GENERAL", "UNSIGNED", 10], + Here GENERAL is generator name by which will be determined (from "ARGUMENTS_GEN") what code + to generate for getting argument value. + Generated code: + auto Surface = CreateOperand(II::ArgInfo(UNSIGNED | 10)); + or for GenXIntrinsicInfoTable.inc: + GENERAL | UNSIGNED | 10, + + To add new intrinsic you need to add new description into INTRINSICS map. If it contains + opcode which is absent in opcode_map you also need to add item for new opcode to OPCODE_GEN. 
+ + For example, let's add a new intrinsic with a new opcode and one new argument generator (NEW_PREDICATION): + "INTRINSICS": + "genx_new": { + "opc": "ISA_NEW", + "exec_size": ["EXECSIZE_FROM_ARG", 1], + "pred": ["NEW_PREDICATION", 1], + "DataOrder": ["BYTE", 5], + "Surface": ["GENERAL", "UNSIGNED", 10], + "DstData": ["RAW", 0], + "Src1Data": ["NULLRAW"] + }, + "OPCODE_GEN": + ISA_NEW: "CISA_CALL(Kernel->AppendNew(exec_size, pred, DataOrder, Src1Data, DstData, Surface));" + "ARGUMENTS_GEN": + "NEW_PREDICATION": "CreateNewPredication(II::ArgInfo({args}))", + We also need to add a new function or lambda named CreateNewPredication to GenXCisaBuilder.cpp. + +*** Field ARGUMENTS_GEN + This field is only needed to generate the CISA building code (GenXIntrinsicsBuildMap.inc). + Pattern keys that can be used inside a generator: + args - string with the arguments that are passed to the ArgInfo constructor. + value1 - first value in the argument list, needed for the LITERAL generator + dst - name of the variable to which the argument value will be assigned + +*** Field OPCODE_GEN + This field is only needed to generate the CISA building code (GenXIntrinsicsBuildMap.inc). + The final part of the generated code for a single intrinsic is a call to the Finalizer function that + builds the instruction itself, so each item of this map simply maps an opcode to its build function. + An opcode need not be a real VISA opcode; for example, ISA_VA_SKL_PLUS uses different build functions + with different signatures depending on its sub-opcode, so compound opcodes exist for such cases. +""" + +import sys +import re +import json +from collections import OrderedDict + + +HEADER = '''/****************************************************************************** + * AUTOGENERATED FILE, DO NOT EDIT! + * Generated by GenXUtilBuild project + */ +''' + +def open_and_delete_comments(dscr_filename): + with open(dscr_filename, "r") as jsonfile: + data = jsonfile.readlines() + jsonwithoutcomments = filter(lambda line: "//" not in line, data) + stringjson = "".join(jsonwithoutcomments) + return stringjson + +def generate(dscr_filename, out_path): + special_keys = ('gen_opc', 'OPTIONS') + descr = json.loads(open_and_delete_comments(dscr_filename), object_pairs_hook=OrderedDict) + opcode_gen = descr['OPCODE_GEN'] + arguments_gen = descr['ARGUMENTS_GEN'] + intrinsics = descr['INTRINSICS'] + + # Convert list to function call string + # Example: [ Func, arg1, arg2] to Func(arg1, arg2) + def gen2str(value): + if isinstance(value, list): + args = [] + for v in value[1:]: + args.append(gen2str(v)) + return "{}({})".format(value[0], ', '.join(args)) + return str(value) + + # Recursively search regex in lists + def gen_search(value, regex): + if isinstance(value, list): + for v in value: + if gen_search(v, regex): + return True + return False + return bool(re.search(regex, value)) + + def isstrinst(opc_gen): + isalt = True + if sys.version_info[0] >= 3: + isalt = isinstance(opc_gen, bytes) + else: + isalt = isinstance(opc_gen, unicode) + return bool(isinstance(opc_gen, str) or isalt) + + with open(out_path + '/GenXIntrinsicInfoTable.inc', 'w') as file: + file.write(HEADER) + for name, intr in intrinsics.items(): + if 'OPTIONS' in intr and 'disable' in intr['OPTIONS']: + continue + if name == "fma": + file.write('Intrinsic::{},\n'.format(name)) + else: + file.write('GenXIntrinsic::{},\n'.format(name)) + for key, value in intr.items(): + if key in special_keys: + continue + elif key == 'opc': + file.write('LITERAL | {},\n'.format(value)) + elif isinstance(value, list):
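+ # A list-valued descriptor such as ["GENERAL", "UNSIGNED", 1] is emitted into + # GenXIntrinsicInfoTable.inc as a single OR-ed word, e.g. "GENERAL | UNSIGNED | 1,"; + # the RAW_OPERANDS marker is meta-information for the build map only and is filtered out here.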
+ file.write('{},\n'.format(' | '.join([str(x) for x in value if x != 'RAW_OPERANDS']))) + else: + # skip other + pass + file.write('END,\n\n') + + + with open(out_path + '/GenXIntrinsicsBuildMap.inc', 'w') as file: + file.write(HEADER) + file.write('switch(IntrinID) {\n\n') + + for name, intr in intrinsics.items(): + gen_opc = intr.get('gen_opc') + if not gen_opc: + gen_opc = intr['opc'] + + opc_gen = opcode_gen.get(gen_opc) + if not opc_gen: + print(intr) + raise RuntimeError("Instruction generator not found") + if isstrinst(opc_gen): + opc_gen = [opc_gen] + assert isinstance(opc_gen, list) + + if 'OPTIONS' in intr and 'disable' in intr['OPTIONS']: + continue + + if name == "fma": + file.write(' case llvm::Intrinsic::' + name + ': {\n') + else: + file.write(' case llvm::GenXIntrinsic::' + name + ': {\n') + + for key, value in intr.items(): + if key in special_keys: + continue + + # no_assign means that there is no variable that need to be assigned + no_assign = key in ('twoaddr', 'nobarrier') + + # skip items that are not exist in generator string + if not no_assign and not gen_search(opc_gen, r'\b%s\b'%key): + continue + + if key == 'opc': + replace = value + elif isinstance(value, list): + replace = arguments_gen.get(value[0]) + if not replace: + print(value) + raise RuntimeError('Key not found!') + if not replace: + continue + context = { 'value1': value[1] if len(value) > 1 else None, 'dst': key, + 'args': '{}'.format(' | ').join( + ['II::' + x if isstrinst(x) + else str(x) for x in value if x != 'RAW_OPERANDS']) } + if isinstance(replace, list): + replace = [x.format(**context) for x in replace] + else: + replace = replace.format(**context) + else: + replace = value + assert replace, 'Unknown token' + + if isinstance(replace, list): + for replace_item in replace: + file.write(' ' + replace_item + ';\n') + else: + assign = '' if no_assign else 'auto ' + key + ' = ' + file.write(' ' + assign + replace + ';\n') + + for g in opc_gen: + file.write(' ' + gen2str(g) + ';\n') + file.write(' } break;\n\n') + + file.write(''' default: + CI->print(errs()); + errs() << '\\n'; + report_fatal_error("Unsupported intrinsic!"); + break; +}''') + +def main(): + if len(sys.argv) > 1 and sys.argv[1] == '--help': + print(__doc__) + sys.exit(0) + assert len(sys.argv) > 2, "Missing arguments! Usage: cisa_gen_intrinsics.py " + generate(sys.argv[1], sys.argv[2]) + +if __name__ == '__main__': + main() diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp new file mode 100644 index 000000000000..7be7ca29da15 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp @@ -0,0 +1,285 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines a routine for folding a GenX intrinsic call into a constant. +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/GenXAnalysis.h" +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "genx-constantfolding" + +using namespace llvm; + +/*********************************************************************** + * canConstantFoldGenXIntrinsic : Return true if it is even possible to fold + * a call to the specified GenX intrinsic + */ +bool llvm::canConstantFoldGenXIntrinsic(unsigned IID) +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // The wrregion case specifically excludes genx_wrconstregion + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: + return true; + } + return false; +} + +/*********************************************************************** + * constantFoldRdRegion : attempt to constant fold rdregion + */ +static Constant *constantFoldRdRegion(Type *RetTy, + ArrayRef Operands, + const CMRegion &R, const DataLayout *DL) { + Constant *Input = Operands[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + // The input can be a ConstantExpr if we are being called from + // CallAnalyzer. + if (isa(Input)) + return nullptr; + // If the input value is undef, just return undef. + if (isa(Input)) + return UndefValue::get(RetTy); + // Parse the region parameters. + unsigned WholeNumElements = Input->getType()->getVectorNumElements(); + auto OffsetC = dyn_cast( + Operands[GenXIntrinsic::GenXRegion::RdIndexOperandNum]); + if (!OffsetC) + return nullptr; + int RetElemSize = RetTy->getScalarType()->getPrimitiveSizeInBits() / 8; + if (!RetElemSize) { + assert(RetTy->getScalarType()->isPointerTy() && + RetTy->getScalarType()->getPointerElementType()->isFunctionTy()); + RetElemSize = DL->getTypeSizeInBits(RetTy) / 8; + } + unsigned Offset = 0; + if (!isa(OffsetC->getType())) + Offset = dyn_cast(OffsetC)->getZExtValue() / RetElemSize; + else + assert(OffsetC->getType()->getVectorNumElements() == R.NumElements); + if (Offset >= WholeNumElements) + return UndefValue::get(RetTy); // out of range index + if (!isa(RetTy)) + return Input->getAggregateElement(Offset); + // Gather the elements of the region being read. 
+ SmallVector Values; + unsigned RowIdx = Offset; + unsigned Idx = RowIdx; + unsigned NextRow = R.Width; + for (unsigned i = 0; i != R.NumElements; ++i) { + if (i == NextRow) { + NextRow += R.Width; + RowIdx += R.VStride; + Idx = RowIdx; + } + if (isa(OffsetC->getType())) { + auto EltOffset = + dyn_cast(OffsetC->getAggregateElement(i))->getZExtValue(); + EltOffset = EltOffset / + (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + Idx += EltOffset; + } + if (Idx >= WholeNumElements) + // push undef value if idx is out of bounds + Values.push_back(UndefValue::get(RetTy->getScalarType())); + else + // Get the element value and push it into Values. + Values.push_back(Input->getAggregateElement(Idx)); + Idx += R.Stride; + } + return ConstantVector::get(Values); +} + +/*********************************************************************** + * constantFoldWrRegion : attempt to constant fold Wrregion + */ +static Constant *constantFoldWrRegion(Type *RetTy, + ArrayRef Operands, + const CMRegion &R, const DataLayout *DL) { + Constant *OldValue = Operands[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + Constant *NewValue = Operands[GenXIntrinsic::GenXRegion::NewValueOperandNum]; + // The inputs can be ConstantExpr if we are being called from + // CallAnalyzer. + if (isa(OldValue) || isa(NewValue)) + return nullptr; + assert(RetTy == OldValue->getType()); + auto OffsetC = + dyn_cast(Operands[GenXIntrinsic::GenXRegion::WrIndexOperandNum]); + if (!OffsetC) + return nullptr; // allow for but do not const fold when index is vector + int RetElemSize = RetTy->getScalarType()->getPrimitiveSizeInBits() / 8; + if (!RetElemSize) { + assert(RetTy->getScalarType()->isPointerTy() && + RetTy->getScalarType()->getPointerElementType()->isFunctionTy()); + RetElemSize = DL->getTypeSizeInBits(RetTy) / 8; + } + unsigned Offset = OffsetC->getSExtValue() / RetElemSize; + if (isa(OldValue) && R.isContiguous() && (Offset == 0)) { + // If old value is undef and new value is splat, and the result vector + // is no bigger than 2 GRFs, then just return a splat of the right type. + Constant *Splat = NewValue; + if (isa(NewValue->getType())) + Splat = NewValue->getSplatValue(); + if (Splat) + if (RetTy->getPrimitiveSizeInBits() <= 2 * 32 * 8) + return ConstantVector::getSplat(RetTy->getVectorNumElements(), Splat); + // If new value fills the whole vector, just return the new value. + if (NewValue->getType() == RetTy) + return NewValue; + } + unsigned WholeNumElements = RetTy->getVectorNumElements(); + // Gather the elements of the old value. + SmallVector Values; + for (unsigned i = 0; i != WholeNumElements; ++i) + Values.push_back(OldValue->getAggregateElement(i)); + // Insert the elements of the new value. 
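+ // Same row/column walk as in constantFoldRdRegion, but writing: the elements of the new value are + // scattered into the copy of the old value starting at Offset, stepping by Stride within a row and + // by VStride across rows; if an index falls out of bounds the values collected so far are returned + // unchanged.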
+ if (Offset >= Values.size()) + return UndefValue::get(RetTy); // out of range index + if (!isa(NewValue->getType())) + Values[Offset] = NewValue; + else { + unsigned RowIdx = Offset; + unsigned Idx = RowIdx; + unsigned NextRow = R.Width; + for (unsigned i = 0; i != R.NumElements; ++i) { + if (i == NextRow) { + NextRow += R.Width; + RowIdx += R.VStride; + Idx = RowIdx; + } + if (Idx >= WholeNumElements) + // return collected values even if idx is out of bounds + return ConstantVector::get(Values); + Values[Idx] = NewValue->getAggregateElement(i); + Idx += R.Stride; + } + } + return ConstantVector::get(Values); +} + +/*********************************************************************** + * constantFoldAll : constant fold llvm.genx.all + * constantFoldAny : constant fold llvm.genx.any + */ +static Constant *constantFoldAll(Type *RetTy, Constant *In) +{ + if (In->isAllOnesValue()) + return Constant::getAllOnesValue(RetTy); + return Constant::getNullValue(RetTy); +} +static Constant *constantFoldAny(Type *RetTy, Constant *In) +{ + if (!In->isNullValue()) + return Constant::getAllOnesValue(RetTy); + return Constant::getNullValue(RetTy); +} + +/*********************************************************************** + * ConstantFoldGenXIntrinsic : attempt to constant fold a call to the + * specified GenX intrinsic with the specified arguments, returning null if + * unsuccessful + */ +Constant *llvm::ConstantFoldGenXIntrinsic(unsigned IID, Type *RetTy, + ArrayRef Operands, ImmutableCallSite CS, const DataLayout *DL) +{ + Instruction *I = const_cast(CS.getInstruction()); + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + CMRegion R(I); + return constantFoldRdRegion(RetTy, Operands, R, DL); + } + // The wrregion case specifically excludes genx_wrconstregion + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: { + CMRegion R(I); + return constantFoldWrRegion(RetTy, Operands, R, DL); + } + case GenXIntrinsic::genx_all: + return constantFoldAll(RetTy, Operands[0]); + case GenXIntrinsic::genx_any: + return constantFoldAny(RetTy, Operands[0]); + } + return nullptr; +} + +/*********************************************************************** + * ConstantFoldGenX : attempt to constant fold genx intrinsics including + * its arguments, returning null if unsuccessful. + */ +Constant *llvm::ConstantFoldGenX(Instruction *I, const DataLayout &DL) { + LLVM_DEBUG(dbgs() << "Trying to fold " << *I << "\n"); + auto IID = GenXIntrinsic::getGenXIntrinsicID(I); + if (!canConstantFoldGenXIntrinsic(IID)) { + LLVM_DEBUG(dbgs() << "Fail: not a genx intrinsic\n"); + return nullptr; + } + + CallSite CS{I}; + auto CheckConst = [](const Use &A) { + Value *V = A.get(); + bool IsConst = isa(V); + if (!IsConst) + LLVM_DEBUG(dbgs() << "Fail: operand " << *V << " is not a constant\n"); + return IsConst; + }; + if (!std::all_of(CS.arg_begin(), CS.arg_end(), CheckConst)) + return nullptr; + + SmallVector ConstantArgs; + ConstantArgs.reserve(CS.arg_size()); + auto FoldOperand = [&DL](const Use &A) { + auto *C = cast(A.get()); + Constant *Folded = ConstantFoldConstant(C, DL); + if (Folded) + LLVM_DEBUG(dbgs() << "Folded operand " << *C << " to " << *Folded + << "\n"); + return Folded ? 
Folded : C; + }; + std::transform(CS.arg_begin(), CS.arg_end(), std::back_inserter(ConstantArgs), + FoldOperand); + + Constant *Folded = ConstantFoldGenXIntrinsic( + IID, CS.getFunctionType()->getReturnType(), ConstantArgs, CS, &DL); + if (Folded) + LLVM_DEBUG(dbgs() << "Successfully constant folded intruction to " + << *Folded << "\n"); + else + LLVM_DEBUG(dbgs() << "Failed to constant fold instruction\n"); + return Folded; +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp new file mode 100644 index 000000000000..34fd453c9cf0 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp @@ -0,0 +1,269 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines a routine for simplifying a GenX intrinsic call to a +// constant or one of the operands. This is for cases where not all operands +// are constant; the constant operand cases are handled in ConstantFoldGenX.cpp. +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/GenXAnalysis.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +#define DEBUG_TYPE "genx-simplify" + +using namespace llvm; + +/*********************************************************************** + * SimplifyGenXIntrinsic : given a GenX intrinsic and a set of arguments, + * see if we can fold the result. + * + * ConstantFoldingGenX.cpp handles pure constant folding cases. This code + * only handles cases where not all operands are constant, but we can do + * some folding anyway. + * + * If this call could not be simplified, returns null. 
+ */ +Value *llvm::SimplifyGenXIntrinsic(unsigned IID, Type *RetTy, Use *ArgBegin, + Use *ArgEnd) { + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // Identity rdregion can be simplified to its "old value" input. + if (RetTy + == ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]->getType()) { + unsigned NumElements = RetTy->getVectorNumElements(); + unsigned Width = cast( + ArgBegin[GenXIntrinsic::GenXRegion::RdWidthOperandNum]) + ->getZExtValue(); + auto IndexV = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::RdIndexOperandNum]); + if (!IndexV) + return nullptr; + unsigned Index = 0; + if (!isa(IndexV->getType())) + Index = dyn_cast(IndexV)->getZExtValue() + / (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + else + return nullptr; + if ((Index == 0 || Index >= NumElements) && + (Width == NumElements || Width == cast(ArgBegin[ + GenXIntrinsic::GenXRegion::RdVStrideOperandNum])->getSExtValue())) + if (NumElements == 1 || cast(ArgBegin[ + GenXIntrinsic::GenXRegion::RdStrideOperandNum])->getSExtValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + } + // rdregion with splatted constant input can be simplified to a constant of + // the appropriate type, ignoring the possibly variable index. + if (auto C = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum])) + if (auto Splat = C->getSplatValue()) { + if (auto VT = dyn_cast(RetTy)) + return ConstantVector::getSplat(VT->getNumElements(), Splat); + return Splat; + } + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + // The wrregion case specifically excludes genx_wrconstregion. + // Identity wrregion can be simplified to its "new value" input. + if (RetTy + == ArgBegin[GenXIntrinsic::GenXRegion::NewValueOperandNum]->getType()) { + if (auto CMask = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::PredicateOperandNum])) { + if (CMask->isAllOnesValue()) { + unsigned NumElements = RetTy->getVectorNumElements(); + unsigned Width = cast( + ArgBegin[GenXIntrinsic::GenXRegion::WrWidthOperandNum]) + ->getZExtValue(); + auto IndexV = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::WrIndexOperandNum]); + if (!IndexV) + return nullptr; + unsigned Index = 0; + if (!isa(IndexV->getType())) + Index = dyn_cast(IndexV)->getZExtValue() + / (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + else + return nullptr; + if ((Index == 0 || Index >= NumElements) && + (Width == NumElements || Width == cast(ArgBegin[ + GenXIntrinsic::GenXRegion::WrVStrideOperandNum])->getSExtValue())) + if (NumElements == 1 || cast(ArgBegin[ + GenXIntrinsic::GenXRegion::WrStrideOperandNum])->getSExtValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::NewValueOperandNum]; + } + } + } + // Wrregion with constant 0 predicate can be simplified to its "old value" + // input. + if (auto CMask = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::PredicateOperandNum])) + if (CMask->isNullValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + // Wrregion writing a value that has just been read out of the same + // region in the same vector can be simplified to its "old value" input. + // This works even if the predicate is not all true. 
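The code that follows implements this read-back test. As a standalone sketch of the condition (hypothetical helper operating on the two calls directly, not part of the patch): the wrregion is redundant when its new value is a rdregion of the same old value and the four region operands, vstride, width, stride and index, are pairwise identical.

// Sketch only; assumes the surrounding file's `using namespace llvm;`.
// The four region operands are laid out consecutively starting at
// {Wr,Rd}VStrideOperandNum in the order vstride, width, stride, index.
static bool writesBackSameRegion(CallInst *Wr, CallInst *Rd) {
  for (unsigned i = 0; i != 4; ++i)
    if (Wr->getArgOperand(GenXIntrinsic::GenXRegion::WrVStrideOperandNum + i) !=
        Rd->getArgOperand(GenXIntrinsic::GenXRegion::RdVStrideOperandNum + i))
      return false;
  return true;
}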
+ if (auto RdR = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::NewValueOperandNum])) { + if (auto RdRFunc = RdR->getCalledFunction()) { + Value *OldVal = ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + if ((GenXIntrinsic::getGenXIntrinsicID(RdRFunc) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getGenXIntrinsicID(RdRFunc) == + GenXIntrinsic::genx_rdregionf) && + RdR->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) + == OldVal) { + // Check the region parameters match between the rdregion and + // wrregion. There are 4 region parameters: vstride, width, stride, + // index. + bool CanSimplify = true; + for (unsigned i = 0; i != 4; ++i) { + if (ArgBegin[GenXIntrinsic::GenXRegion::WrVStrideOperandNum + i] + != RdR->getArgOperand( + GenXIntrinsic::GenXRegion::RdVStrideOperandNum + i)) { + CanSimplify = false; + break; + } + } + if (CanSimplify) + return OldVal; + } + } + } + break; + case GenXIntrinsic::genx_wrpredregion: + // wrpredregion with undef "new value" input is simplified to the "old + // value" input. + if (isa(ArgBegin[1])) + return ArgBegin[0]; + break; + } + return nullptr; +} + +/*********************************************************************** + * SimplifyGenX : given a GenX related instruction, see if we can fold + * the result. + * + * ConstantFoldingGenX.cpp handles pure constant folding cases. This code + * also handles cases where not all operands are constant. + * + * If this instruction could not be simplified, returns null. + */ +Value *llvm::SimplifyGenX(CallInst *I) { + CallSite CS{I}; + Value *V = CS.getCalledValue(); + Type *Ty = V->getType(); + if (auto *PTy = dyn_cast(Ty)) + Ty = PTy->getElementType(); + auto *FTy = cast(Ty); + auto *F = dyn_cast(V); + if (!F) + return nullptr; + + LLVM_DEBUG(dbgs() << "Trying to simplify " << *I << "\n"); + auto GenXID = GenXIntrinsic::getGenXIntrinsicID(F); + if (Value *Ret = SimplifyGenXIntrinsic(GenXID, FTy->getReturnType(), + CS.arg_begin(), CS.arg_end())) { + LLVM_DEBUG(dbgs() << "Simplified to " << *Ret << "\n"); + return Ret; + } + + LLVM_DEBUG(dbgs() << "Failed to simplify, trying to constant fold\n"); + Constant *C = ConstantFoldGenX(I, I->getModule()->getDataLayout()); + if (C) + LLVM_DEBUG(dbgs() << "Successfully folded to " << *C << "\n"); + else + LLVM_DEBUG(dbgs() << "Failed to constant fold instruction\n"); + return C; +} + +namespace llvm { +void initializeGenXSimplifyPass(PassRegistry &); +} + +namespace { +class GenXSimplify : public FunctionPass { +public: + static char ID; + + GenXSimplify() : FunctionPass(ID) { + initializeGenXSimplifyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +bool GenXSimplify::runOnFunction(Function &F) { + const DataLayout &DL = F.getParent()->getDataLayout(); + bool Changed = false; + for (auto &BB : F) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto *CI = dyn_cast(Inst)) { + if (GenXIntrinsic::isGenXIntrinsic(CI)) { + if (Value *V = SimplifyGenX(CI)) { + CI->replaceAllUsesWith(V); + CI->eraseFromParent(); + Changed = true; + } + continue; + } + } + + if (Value *V = SimplifyInstruction(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +char GenXSimplify::ID = 0; +INITIALIZE_PASS(GenXSimplify, "genx-simplify", + "simplify genx specific instructions", false, false) + +FunctionPass 
*llvm::createGenXSimplifyPass() { return new GenXSimplify; } diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp new file mode 100644 index 000000000000..50a49fba17df --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp @@ -0,0 +1,1757 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPacketize +/// ------------- +/// +/// - Vectorize the SIMT functions +/// +/// - Vectorize the generic function called by the SIMT functions +/// +/// - Replace generic control-flow with SIMD control-flow +/// +//===----------------------------------------------------------------------===// + +#include "PacketBuilder.h" + +#include "llvmWrapper/Support/Alignment.h" + +#include "vc/GenXOpts/Utils/CMRegion.h" + +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXSimdCFLowering.h" +#include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +#include +#include +#include +#include + +using namespace pktz; + +namespace llvm { + +/// Packetizing SIMT functions +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// a) Look for functions with attributes CMGenXSIMT +/// If no such function, end the pass +/// +/// b) sort functions in call-graph topological order +/// find those generic functions called by the SIMT functions +/// find all the possible widthes those functions should be vectorized to +/// +/// c) find those uniform function arguments +/// arguments for non-SIMT functions are uniform +/// arguments for SIMT-entry are uniform +/// arguments for SIMT-functions are uniform if it is only defined by +/// callers' uniform argument. +/// +/// d) Run reg2mem pass to remove phi-nodes +/// This is because we need to generate simd-control-flow +/// after packetization. simd-control-flow lowering cannot handle phi-node. 
+/// +/// e) for uniform arguments +/// Mark the allocas for those arguments as uniform +/// Mark the load/store for those allocas as uniform +/// +/// f) vectorize generic functions to its SIMT width, callee first +/// - create the vector prototype +/// - clone the function-body into the vector prototype +/// - vectorize the function-body +/// - note: original function is kept because it may be used outside SIMT +/// +/// g) vectorize SIMT-entry functions +/// - no change of function arguments +/// - no cloning, direct-vectorization on the function-body +/// +/// h) SIMD-control-flow lowering +/// +/// i) run mem2reg pass to create SSA +/// +/// j) CMABI pass to remove global Execution-Mask +/// +class GenXPacketize : public ModulePass { +public: + static char ID; + explicit GenXPacketize() : ModulePass(ID) {} + ~GenXPacketize() { releaseMemory(); } + virtual StringRef getPassName() const override { return "GenX Packetize"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(BreakCriticalEdgesID); + }; + bool runOnModule(Module &M) override; + void releaseMemory() override { + ReplaceMap.clear(); + UniformArgs.clear(); + UniformInsts.clear(); + FuncOrder.clear(); + FuncVectors.clear(); + FuncMap.clear(); + } + +private: + void findFunctionVectorizationOrder(Module *M); + + Value *getPacketizeValue(Value *OrigValue); + Value *getUniformValue(Value *OrigValue); + Function *getVectorIntrinsic(Module *M, unsigned id, std::vector &ArgTy); + Value *packetizeConstant(Constant *pConstant); + Value *packetizeGenXIntrinsic(Instruction *pInst); + Value *packetizeLLVMIntrinsic(Instruction *pInst); + Value *packetizeLLVMInstruction(Instruction *pInst); + Value *packetizeInstruction(Instruction *pInst); + + void replaceAllUsesNoTypeCheck(Value *pInst, Value *pNewInst); + void removeDeadInstructions(Function &F); + void fixupLLVMIntrinsics(Function &F); + + Function *vectorizeSIMTFunction(Function *F, unsigned Width); + bool vectorizeSIMTEntry(Function &F); + + bool isUniformIntrinsic(unsigned id); + void findUniformArgs(Function &F); + void findUniformInsts(Function &F); + + void lowerControlFlowAfter(std::vector &SIMTFuncs); + GlobalVariable *findGlobalExecMask(); + +private: + Module *M; + PacketBuilder *B; + + // track already packetized values + ValueToValueMapTy ReplaceMap; + + /// uniform set for arguments + std::set UniformArgs; + /// uniform set for alloca, load, store, and GEP + std::set UniformInsts; + /// sort function in caller-first order + std::vector FuncOrder; + /// map: function ==> a set of vectorization width + std::map> FuncVectors; + /// Map: original function and vectorization width ==> vectorized version + std::map, Function *> FuncMap; + + const DataLayout *DL; +}; + +bool GenXPacketize::runOnModule(Module &Module) { + M = &Module; + // find all the SIMT enntry-functions + std::vector ForkFuncs; + for (auto &F : M->getFunctionList()) { + if (F.hasFnAttribute("CMGenxSIMT")) { + uint32_t Width = 0; + F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width); + if (Width > 1) { + assert(Width == 8 || Width == 16 || Width == 32); + ForkFuncs.push_back(&F); + } + } + } + if (ForkFuncs.empty()) + return false; + + // sort functions in order, also find those functions that are used in + // the SIMT mode, therefore need whole-function vectorization. 
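Everything in runOnModule keys off the "CMGenxSIMT" string attribute scanned above: its value carries the SIMD width, and only widths of 8, 16 or 32 trigger packetization. A minimal sketch of how such an entry could be marked and queried (hypothetical helpers, not part of the patch):

#include "llvm/IR/Function.h"
#include <string>

// Illustrative only: tag a function as a SIMT entry of the given width.
static void markSimtEntry(llvm::Function &F, unsigned Width) {
  F.addFnAttr("CMGenxSIMT", std::to_string(Width));
}

// Illustrative only: read the width back; 0 or 1 means "leave it scalar".
static unsigned simtWidth(const llvm::Function &F) {
  unsigned Width = 0;
  if (F.hasFnAttribute("CMGenxSIMT"))
    F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width);
  return Width;
}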
+ findFunctionVectorizationOrder(M); + + unsigned NumFunc = FuncOrder.size(); + // find uniform arguments + UniformArgs.clear(); + for (unsigned i = 0; i < NumFunc; ++i) { + auto F = FuncOrder[i]; + findUniformArgs(*F); + } + + // perform reg-to-mem to remove phi before packetization + // because we need to generate simd-control-flow after packetization + // we then perform mem-to-reg after generating simd-control-flow. + std::unique_ptr DemotePass(createDemoteRegisterToMemoryPass()); + for (auto &F : M->getFunctionList()) { + DemotePass->runOnFunction(F); + } + + UniformInsts.clear(); + + DL = &(M->getDataLayout()); + B = new PacketBuilder(M); + std::vector SIMTFuncs; + // Process those functions called in the SIMT mode + for (int i = NumFunc - 1; i >= 0; --i) { + auto F = FuncOrder[i]; + auto iter = FuncVectors.find(F); + if (iter != FuncVectors.end()) { + auto WV = iter->second; + for (auto W : WV) { + auto VF = vectorizeSIMTFunction(F, W); + auto Key = std::pair(F, W); + FuncMap.insert( + std::pair, Function *>(Key, VF)); + SIMTFuncs.push_back(VF); + } + } + } + + // vectorize SIMT entry-functions + bool Modified = false; + for (auto F : ForkFuncs) { + Modified |= vectorizeSIMTEntry(*F); + SIMTFuncs.push_back(&(*F)); + } + + delete B; + + // lower the SIMD control-flow + lowerControlFlowAfter(SIMTFuncs); + + return Modified; +} + +/*************************************************************************** + * vectorize a functions that is used in the fork-region + */ +Function *GenXPacketize::vectorizeSIMTFunction(Function *F, unsigned Width) { + assert(!F->hasFnAttribute("CMGenxSIMT")); + B->SetTargetWidth(Width); + + // vectorize the argument and return types + std::vector ArgTypes; + for (const Argument &I : F->args()) { + if (UniformArgs.count(&I)) + ArgTypes.push_back(I.getType()); + else if (I.getType()->isPointerTy()) { + // FIXME: check the pointer defined by an argument or an alloca + // [N x float]* should packetize to [N x <8 x float>]* + auto VTy = PointerType::get( + B->GetVectorType(I.getType()->getPointerElementType()), + I.getType()->getPointerAddressSpace()); + ArgTypes.push_back(VTy); + } else { + ArgTypes.push_back(B->GetVectorType(I.getType())); + } + } + Type *RetTy = B->GetVectorType(F->getReturnType()); + // Create a new function type... + assert(!F->isVarArg()); + FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, false); + + // Create the vector function prototype + StringRef VecFName = F->getName(); + const char *Suffix[] = {".vec00", ".vec08", ".vec16", ".vec24", ".vec32"}; + Function *ClonedFunc = + Function::Create(FTy, GlobalValue::InternalLinkage, + VecFName + Suffix[Width / 8], F->getParent()); + ClonedFunc->setCallingConv(F->getCallingConv()); + ClonedFunc->setAttributes(F->getAttributes()); + ClonedFunc->setAlignment(IGCLLVM::getAlign(F->getAlignment())); + + // then use CloneFunctionInto + ValueToValueMapTy ArgMap; + Function::arg_iterator ArgI = ClonedFunc->arg_begin(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + ArgI->setName(I->getName()); // Copy the name over... 
+ ArgMap[I] = ArgI; // Add mapping to ValueMap + if (UniformArgs.count(I)) { // bookkeep the uniform set + UniformArgs.insert(ArgI); + } + ArgI++; + } + SmallVector returns; + ClonedCodeInfo CloneInfo; + CloneFunctionInto(ClonedFunc, F, ArgMap, true, returns, Suffix[Width / 8], + &CloneInfo); + + ReplaceMap.clear(); + // find uniform instructions related to uniform arguments + findUniformInsts(*ClonedFunc); + + // vectorize instructions in the fork-regions + for (auto I = ClonedFunc->begin(), E = ClonedFunc->end(); I != E; ++I) { + BasicBlock *BB = &*I; + for (auto &I : BB->getInstList()) { + if (!UniformInsts.count(&I)) { + Value *pPacketizedInst = packetizeInstruction(&I); + ReplaceMap[&I] = pPacketizedInst; + } else { + for (int i = 0, n = I.getNumOperands(); i < n; ++i) { + Value *OrigValue = I.getOperand(i); + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end() && iter->second != OrigValue) { + I.setOperand(i, iter->second); + } + } + } + } + } + + removeDeadInstructions(*ClonedFunc); + + return ClonedFunc; +} + +/*************************************************************************** + * vectorize a SIMT-entry function + */ +bool GenXPacketize::vectorizeSIMTEntry(Function &F) { + assert(F.hasFnAttribute("CMGenxSIMT")); + + // find uniform instructions related to uniform arguments + findUniformInsts(F); + + uint32_t Width = 0; + F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width); + + B->SetTargetWidth(Width); + + ReplaceMap.clear(); + + B->IRB()->SetInsertPoint(&F.getEntryBlock(), F.getEntryBlock().begin()); + + // vectorize instructions in the fork-regions + for (auto I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = &*I; + for (auto &I : BB->getInstList()) { + if (!UniformInsts.count(&I)) { + Value *pPacketizedInst = packetizeInstruction(&I); + ReplaceMap[&I] = pPacketizedInst; + } else { + for (int i = 0, n = I.getNumOperands(); i < n; ++i) { + Value *OrigValue = I.getOperand(i); + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end() && iter->second != OrigValue) { + I.setOperand(i, iter->second); + } + } + } + } + } + + removeDeadInstructions(F); + // a SIMT entry is always inlined after vectorization + // This is required in order to handle structure argument, + // for example, generated from lambda capture. + if (F.hasFnAttribute(Attribute::NoInline)) + F.removeFnAttr(Attribute::NoInline); + F.addFnAttr(Attribute::AlwaysInline); + F.removeFnAttr("CMGenxSIMT"); + F.setLinkage(GlobalValue::InternalLinkage); + + return true; +} + +/************************************************************************ + * findFunctionVectorizationOrder : calculate the order we want to visit + * functions, such that a function is not visited until all its callees + * have been visited. Also if a function is called directly or indirectly + * in the SIMT mode, add it to the list that need vectorization + */ +// Call graph node +struct CGNode { + Function *F; + std::set UnvisitedCallers; + std::set Callees; +}; + +void GenXPacketize::findFunctionVectorizationOrder(Module *M) { + // First build the call graph. + // We roll our own call graph here, because it is simpler than the general + // case supported by LLVM's call graph analysis (CM does not support + // recursion or function pointers), and we want to modify it (using the + // UnvisitedCallers set) when we traverse it. 
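The structure built here is effectively Kahn's topological sort with callers as predecessors: a function is emitted only once every one of its callers has been emitted. A minimal sketch of the same traversal on a plain adjacency map (hypothetical integer IDs instead of Function pointers, and without the width propagation the real loops also perform):

#include <map>
#include <set>
#include <vector>

// Sketch only: order IDs so that each one appears after all of its callers.
static std::vector<int>
callerFirstOrder(const std::map<int, std::set<int>> &Callees,
                 std::map<int, unsigned> UnvisitedCallers) {
  std::vector<int> Order;
  for (const auto &Node : UnvisitedCallers)
    if (Node.second == 0)                      // no callers: start of the order
      Order.push_back(Node.first);
  for (unsigned i = 0; i != Order.size(); ++i) {
    auto It = Callees.find(Order[i]);
    if (It == Callees.end())
      continue;
    for (int Callee : It->second)
      if (--UnvisitedCallers[Callee] == 0)     // last caller just visited
        Order.push_back(Callee);
  }
  return Order;
}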
+ std::map CallGraph; + for (auto mi = M->begin(), me = M->end(); mi != me; ++mi) { + Function *F = &*mi; + if (F->empty()) + continue; + + fixupLLVMIntrinsics(*F); + + // For each defined function: for each use (a call), add it to our + // UnvisitedCallers set, and add us to its Callees set. + // We are ignoring an illegal non-call use of a function; someone + // else can spot and diagnose that later. + // If the function has no callers, then add it straight in to FuncOrder. + CGNode *CGN = &CallGraph[F]; + CGN->F = F; + if (F->use_empty()) { + FuncOrder.push_back(F); + continue; + } + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) { + BasicBlock *Blk = CI->getParent(); + Function *Caller = Blk->getParent(); + CGNode *CallerNode = &CallGraph[Caller]; + CallerNode->F = Caller; + CGN->UnvisitedCallers.insert(CallerNode); + CallerNode->Callees.insert(CGN); + // find the vectorization width of callee + auto CallerVectorIter = FuncVectors.find(Caller); + if (CallerVectorIter != FuncVectors.end()) { + auto CalleeVectorIter = FuncVectors.find(F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(CallerVectorIter->second.begin(), + CallerVectorIter->second.end()); + else + FuncVectors.insert(std::pair>( + F, CallerVectorIter->second)); + } else if (Caller->hasFnAttribute("CMGenxSIMT")) { + uint32_t width = 0; + Caller->getFnAttribute("CMGenxSIMT") + .getValueAsString() + .getAsInteger(0, width); + if (width > 1) { + auto CalleeVectorIter = FuncVectors.find(F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(width); + else { + std::set WidthSet; + WidthSet.insert(width); + FuncVectors.insert( + std::pair>(F, WidthSet)); + } + } + } + } + } + } + // Run through the visit order. For each function, remove it from each + // callee's UnvisitedCallers set, and, if now empty, add the callee to + // the end of the visit order. 
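Besides ordering, the walk described above also pushes vectorization widths down the call graph: a callee has to be cloned for every width any of its SIMT callers runs at, so the caller's width set is merged into the callee's. A tiny sketch of that merge, mirroring what the loop below does with FuncVectors (hypothetical integer IDs, not part of the patch):

#include <map>
#include <set>

// Sketch only: the callee inherits the union of its caller's SIMD widths.
static void propagateWidths(std::map<int, std::set<unsigned>> &FuncVectors,
                            int Caller, int Callee) {
  auto It = FuncVectors.find(Caller);
  if (It == FuncVectors.end())
    return; // caller itself is not vectorized
  FuncVectors[Callee].insert(It->second.begin(), It->second.end());
}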
+ for (unsigned i = 0; i != FuncOrder.size(); ++i) { + CGNode *CGN = &CallGraph[FuncOrder[i]]; + for (auto ci = CGN->Callees.begin(), ce = CGN->Callees.end(); ci != ce; + ++ci) { + CGNode *Callee = *ci; + Callee->UnvisitedCallers.erase(CGN); + if (Callee->UnvisitedCallers.empty()) + FuncOrder.push_back(Callee->F); + // find the vectorization width of callee + auto CallerVectorIter = FuncVectors.find(CGN->F); + if (CallerVectorIter != FuncVectors.end()) { + auto CalleeVectorIter = FuncVectors.find(Callee->F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(CallerVectorIter->second.begin(), + CallerVectorIter->second.end()); + else + FuncVectors.insert( + std::make_pair(Callee->F, CallerVectorIter->second)); + } + } + } +} + +void GenXPacketize::findUniformArgs(Function &F) { + auto iter = FuncVectors.find(&F); + if (iter == FuncVectors.end()) { + // non-simt function or simt-entry function + for (const Argument &I : F.args()) + UniformArgs.insert(&I); + } else { + // simt functions that needs whole-function vectorization + for (const Argument &I : F.args()) { + bool IsUniform = true; + // check every call-site + for (User *U : F.users()) { + if (CallInst *CI = dyn_cast(U)) { + auto Def = CI->getArgOperand(I.getArgNo()); + if (Argument *DA = dyn_cast(Def)) { + if (!UniformArgs.count(DA)) { + IsUniform = false; + break; + } + } else { + IsUniform = false; + break; + } + } else { + IsUniform = false; + break; + } + } + if (IsUniform) + UniformArgs.insert(&I); + } + } +} + +bool GenXPacketize::isUniformIntrinsic(unsigned id) { + switch (id) { + case GenXIntrinsic::genx_get_color: + case GenXIntrinsic::genx_get_hwid: + case GenXIntrinsic::genx_get_scoreboard_bti: + case GenXIntrinsic::genx_get_scoreboard_deltas: + case GenXIntrinsic::genx_get_scoreboard_depcnt: + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_group_count: + case GenXIntrinsic::genx_group_id_x: + case GenXIntrinsic::genx_group_id_y: + case GenXIntrinsic::genx_group_id_z: + case GenXIntrinsic::genx_predefined_surface: + case GenXIntrinsic::genx_barrier: + case GenXIntrinsic::genx_sbarrier: + case GenXIntrinsic::genx_cache_flush: + case GenXIntrinsic::genx_fence: + case GenXIntrinsic::genx_wait: + case GenXIntrinsic::genx_yield: + case GenXIntrinsic::genx_print_buffer: + case GenXIntrinsic::genx_r0: + case GenXIntrinsic::genx_sr0: + case GenXIntrinsic::genx_timestamp: + case GenXIntrinsic::genx_thread_x: + case GenXIntrinsic::genx_thread_y: + return true; + default: + break; + } + return false; +} + +void GenXPacketize::findUniformInsts(Function &F) { + // global variable load is uniform + for (auto &Global : M->getGlobalList()) { + for (auto UI = Global.use_begin(), UE = Global.use_end(); UI != UE; ++UI) { + if (auto LD = dyn_cast(UI->getUser())) { + UniformInsts.insert(LD); + } + } + } + // some intrinsics are always uniform + for (auto &FD : M->getFunctionList()) { + if (FD.isDeclaration()) { + if (isUniformIntrinsic(GenXIntrinsic::getGenXIntrinsicID(&FD))) { + for (auto UI = FD.use_begin(), UE = FD.use_end(); UI != UE; ++UI) { + if (auto Inst = dyn_cast(UI->getUser())) { + UniformInsts.insert(Inst); + } + } + } + } + } + // first find out all the uniform alloca to store those uniform arguments + std::stack uvset; + for (const Argument &I : F.args()) { + if (!UniformArgs.count(&I)) + continue; + for (auto UI = I.user_begin(), E = I.user_end(); UI != E; ++UI) { + const Value *use = (*UI); + if (auto LI = 
dyn_cast(use)) { + UniformInsts.insert(LI); + } else if (auto GEP = dyn_cast(use)) { + if (GEP->getPointerOperand() == &I) { + UniformInsts.insert(GEP); + uvset.push((Value *)GEP); + } + } else if (auto SI = dyn_cast(use)) { + if (SI->getPointerOperand() == &I) + UniformInsts.insert(SI); + else { + auto PI = SI->getPointerOperand(); + if (auto AI = dyn_cast(PI)) { + UniformInsts.insert(AI); + uvset.push((Value *)AI); + } + } + } else if (auto CI = dyn_cast(use)) { + if (Function *Callee = CI->getCalledFunction()) { + if (GenXIntrinsic::isVLoadStore(Callee)) { + UniformInsts.insert(CI); + } + } + } + } + } + + // then find the uniform loads and stores in fork-region + while (!uvset.empty()) { + Value *Def = uvset.top(); + uvset.pop(); + for (auto UI = Def->user_begin(), E = Def->user_end(); UI != E; ++UI) { + Value *use = (*UI); + if (auto UseI = dyn_cast(use)) { + if (isa(UseI)) { + UniformInsts.insert(UseI); + } else if (auto LI = dyn_cast(UseI)) { + UniformInsts.insert(UseI); + if (LI->getType()->isPointerTy()) + uvset.push(UseI); + } else if (auto GEP = dyn_cast(UseI)) { + if (GEP->hasAllConstantIndices()) { + uvset.push(UseI); + UniformInsts.insert(UseI); + } + } + } + } + } + return; +} + +Value *GenXPacketize::getPacketizeValue(Value *OrigValue) { + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end()) { + return iter->second; + } else if (auto C = dyn_cast(OrigValue)) { + return packetizeConstant(C); + } else if (auto A = dyn_cast(OrigValue)) { + if (UniformArgs.count(A)) + return B->VBROADCAST(OrigValue, OrigValue->getName()); + // otherwise the argument should have been in the right vector form + ReplaceMap[OrigValue] = OrigValue; + return OrigValue; + } else if (auto Inst = dyn_cast(OrigValue)) { + // need special handling for alloca + if (auto AI = dyn_cast(OrigValue)) { + // this is not a uniform alloca + if (!UniformInsts.count(Inst)) { + Type *VecType = B->GetVectorType(AI->getAllocatedType()); + auto V = B->ALLOCA(VecType, nullptr, AI->getName()); + V->removeFromParent(); + V->insertBefore(Inst); + ReplaceMap[OrigValue] = V; + return V; + } + ReplaceMap[OrigValue] = OrigValue; + return OrigValue; + } else if (UniformInsts.count(Inst)) { + auto V = B->VBROADCAST(OrigValue); + return V; + } + } + + report_fatal_error("Could not find packetized value!"); + + return nullptr; +} + +// this is used on operands that are expected to be uniform +Value *GenXPacketize::getUniformValue(Value *OrigValue) { + if (auto G = dyn_cast(OrigValue)) + return G; + if (auto C = dyn_cast(OrigValue)) + return C; + if (auto A = dyn_cast(OrigValue)) { + if (UniformArgs.count(A)) { + return A; + } + } + if (auto A = dyn_cast(OrigValue)) { + if (UniformInsts.count(A)) { + return A; + } + } + auto VV = getPacketizeValue(OrigValue); + return B->VEXTRACT(VV, (uint64_t)0, OrigValue->getName()); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Returns the equivalent vector intrinsic for the input scalar +/// intrinsic +Function *GenXPacketize::getVectorIntrinsic(Module *M, unsigned id, + std::vector &ArgTy) +{ + if (id == Intrinsic::fma) { + return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, ArgTy[0]); + } else if (id == Intrinsic::pow) { + // for some reason, passing the 2 vector input args to the pow declaration + // results in a malformed vectored pow intrinsic. Forcing the expected + // vector input here. 
+ return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, B->mSimdFP32Ty); + } else if ((id == Intrinsic::maxnum) || (id == Intrinsic::minnum)) { + return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, ArgTy[0]); + } else { + return GenXIntrinsic::getAnyDeclaration(M, id, ArgTy); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Determines if instruction is an llvm intrinsic (which may include +/// x86 intrinsics +static bool IsLLVMIntrinsic(Instruction *pInst) { + if (isa(pInst)) { + CallInst *call = cast(pInst); + Function *f = call->getCalledFunction(); + assert(f); + return f->isIntrinsic(); + } + return false; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Packetize a scalar constant +Value *GenXPacketize::packetizeConstant(Constant *pConstant) { + if (isa(pConstant)) { + return UndefValue::get(B->GetVectorType(pConstant->getType())); + } else { + return B->VBROADCAST(pConstant); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Packetize an LLVM intrinsic. Generally this means replacing +/// a scalar intrinsic function call with a vectored equivalent. +Value *GenXPacketize::packetizeLLVMIntrinsic(Instruction *pInst) { + Module *M = B->mpModule; + + B->IRB()->SetInsertPoint(pInst); + CallInst *pCall = cast(pInst); + Function *f = pCall->getCalledFunction(); + assert(f && f->isIntrinsic()); + auto id = GenXIntrinsic::getAnyIntrinsicID(f); + + // packetize intrinsic operands + std::vector vectorArgTys; + std::vector packetizedArgs; + for (auto &operand : pCall->arg_operands()) { + auto VV = getPacketizeValue(operand.get()); + packetizedArgs.push_back(VV); + vectorArgTys.push_back(VV->getType()); + } + + // override certain intrinsics + Value *pNewCall; + switch (id) { + case Intrinsic::log2: + pNewCall = B->VLOG2PS(packetizedArgs[0]); + break; + case Intrinsic::exp2: + pNewCall = B->VEXP2PS(packetizedArgs[0]); + break; + default: { + Function *newF = getVectorIntrinsic(M, id, vectorArgTys); + pNewCall = CallInst::Create(newF, packetizedArgs, "", pCall); + } + } + return pNewCall; +} + +Value *GenXPacketize::packetizeLLVMInstruction(Instruction *pInst) { + Value *pReplacedInst = nullptr; + B->IRB()->SetInsertPoint(pInst); + // packetize a call + if (auto CI = dyn_cast(pInst)) { + auto F = CI->getCalledFunction(); + auto FMI = FuncMap.find(std::pair(F, B->mVWidth)); + if (FMI != FuncMap.end()) { + std::vector ArgOps; + auto VF = FMI->second; + for (Argument &Arg : VF->args()) { + auto i = Arg.getArgNo(); + if (UniformArgs.count(&Arg)) + ArgOps.push_back(getUniformValue(CI->getArgOperand(i))); + else + ArgOps.push_back(getPacketizeValue(CI->getArgOperand(i))); + } + pReplacedInst = CallInst::Create(VF, ArgOps, CI->getName(), CI); + return pReplacedInst; + } else + assert(false); + } + uint32_t opcode = pInst->getOpcode(); + + switch (opcode) { + case Instruction::AddrSpaceCast: + case Instruction::BitCast: { + // packetize the bitcast source + Value *pPacketizedSrc = getPacketizeValue(pInst->getOperand(0)); + Type *pPacketizedSrcTy = pPacketizedSrc->getType(); + + // packetize dst type + Type *pReturnTy; + if (pInst->getType()->isPointerTy()) { + // two types of pointers, * or + Type *pDstScalarTy = pInst->getType()->getPointerElementType(); + + if (pPacketizedSrc->getType()->isVectorTy()) { + // + Type *pDstPtrTy = PointerType::get( + pDstScalarTy, pInst->getType()->getPointerAddressSpace()); + uint32_t numElems = 
pPacketizedSrcTy->getVectorNumElements(); + pReturnTy = VectorType::get(pDstPtrTy, numElems); + } else { + // * + pReturnTy = + PointerType::get(B->GetVectorType(pDstScalarTy), + pInst->getType()->getPointerAddressSpace()); + } + } else { + pReturnTy = B->GetVectorType(pInst->getType()); + } + + pReplacedInst = + B->CAST((Instruction::CastOps)opcode, pPacketizedSrc, pReturnTy); + break; + } + + case Instruction::GetElementPtr: { + GetElementPtrInst *pGepInst = cast(pInst); + auto pBase = pGepInst->getPointerOperand(); + Value *pVecSrc = nullptr; + if (dyn_cast(pBase)) + pVecSrc = pBase; + else if (dyn_cast(pBase)) + pVecSrc = pBase; + else if (dyn_cast(pBase) && + UniformInsts.count(dyn_cast(pBase))) + pVecSrc = pBase; + else + pVecSrc = getPacketizeValue(pBase); + + if (!isa(pVecSrc)) { + // just packetize the GEP to a vector GEP. + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(getPacketizeValue(pGepInst->getOperand(1 + i))); + } + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } else { + if (pGepInst->hasAllConstantIndices()) { + // SOA GEP with scalar src and constant indices, result will be * Ex. gep [4 x <8 x float>]*, 0, 0 --> <8 x float>* + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(pGepInst->getOperand(1 + i)); + } + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } else { + //// SOA GEP with non-uniform indices. Need to vector GEP to each SIMD + /// lane. + /// Result will be + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(getPacketizeValue(pGepInst->getOperand(1 + i))); + } + + // Step to the SIMD lane + if (B->mVWidth == 8) { + vecIndices.push_back(B->C({0, 1, 2, 3, 4, 5, 6, 7})); + } else if (B->mVWidth == 16) { + vecIndices.push_back( + B->C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15})); + } else { + report_fatal_error("Unsupported SIMD width."); + } + + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } + } + break; + } + + case Instruction::Load: { + LoadInst *pLoadInst = cast(pInst); + Value *pSrc = pLoadInst->getPointerOperand(); + Value *pVecSrc = getPacketizeValue(pSrc); + auto LI = cast(pInst); + if (pVecSrc == pSrc) + pReplacedInst = pInst; + else if (pVecSrc->getType()->isVectorTy()) { + assert(pVecSrc->getType()->getVectorElementType()->isPointerTy()); + auto Align = LI->getAlignment(); + pReplacedInst = B->MASKED_GATHER(pVecSrc, Align); + } else { + auto Align = LI->getAlignment(); + pReplacedInst = B->ALIGNED_LOAD(pVecSrc, Align); + } + break; + } + + case Instruction::Store: { + StoreInst *pStoreInst = cast(pInst); + Value *pVecDstPtrs = getPacketizeValue(pStoreInst->getPointerOperand()); + Value *pVecSrc = getPacketizeValue(pStoreInst->getOperand(0)); + if (pVecDstPtrs->getType()->isVectorTy()) { + assert(pVecDstPtrs->getType()->getVectorElementType()->isPointerTy()); + auto Align = cast(pInst)->getAlignment(); + pReplacedInst = B->MASKED_SCATTER(pVecSrc, pVecDstPtrs, Align); + } else { + pReplacedInst = B->STORE(pVecSrc, pVecDstPtrs); + } + break; + } + + case Instruction::ExtractElement: { + auto OldVec = pInst->getOperand(0); + auto Vec = getPacketizeValue(OldVec); + auto Idx = pInst->getOperand(1); + auto N = OldVec->getType()->getVectorNumElements(); + auto ElemType = pInst->getType(); + auto VecDstTy = VectorType::get(ElemType, B->mVWidth); + // create an read-region + CMRegion R(VecDstTy); + if (ConstantInt *CI = dyn_cast(Idx)) { + R.Offset = 
CI->getSExtValue() * ElemType->getPrimitiveSizeInBits() / 8; + R.Indirect = nullptr; + } else { + R.Offset = 0; + auto NBits = Idx->getType()->getIntegerBitWidth() / 8; + auto MulCType = IntegerType::getIntNTy(M->getContext(), NBits); + auto MulC = + ConstantInt::get(MulCType, ElemType->getPrimitiveSizeInBits() / 8); + R.Indirect = B->MUL(Idx, MulC); + } + R.NumElements = B->mVWidth; + R.Width = B->mVWidth; + R.Stride = N; + R.VStride = 0; + pReplacedInst = + R.createRdRegion(Vec, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + break; + } + + case Instruction::InsertElement: { + auto OldVec = pInst->getOperand(0); + auto Vec = getPacketizeValue(OldVec); + auto ElmVec = getPacketizeValue(pInst->getOperand(1)); + auto Idx = pInst->getOperand(2); + auto N = OldVec->getType()->getVectorNumElements(); + auto ElemType = pInst->getOperand(1)->getType(); + // create an write-region + CMRegion R(Vec->getType()); + if (ConstantInt *CI = dyn_cast(Idx)) { + R.Offset = CI->getSExtValue() * ElemType->getPrimitiveSizeInBits() / 8; + R.Indirect = nullptr; + } else { + R.Offset = 0; + auto NBits = Idx->getType()->getIntegerBitWidth() / 8; + auto MulCType = IntegerType::getIntNTy(M->getContext(), NBits); + auto MulC = + ConstantInt::get(MulCType, ElemType->getPrimitiveSizeInBits() / 8); + R.Indirect = B->MUL(Idx, MulC); + } + R.NumElements = B->mVWidth; + R.Width = B->mVWidth; + R.Stride = N; + R.VStride = 0; + pReplacedInst = + R.createWrRegion(Vec, ElmVec, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc()); + break; + } + + case Instruction::Br: { + // any conditional branches with vectored conditions need to preceded with + // a genx_simdcf_any to ensure we branch iff all lanes are set + BranchInst *pBranch = cast(pInst); + if (pBranch->isConditional()) { + Value *vCondition = getPacketizeValue(pBranch->getCondition()); + llvm::Function *NewFn = GenXIntrinsic::getGenXDeclaration( + B->mpModule, + GenXIntrinsic::genx_simdcf_any, + vCondition->getType()); + llvm::CallInst *NewTest = CallInst::Create(NewFn, vCondition, "", pInst); + NewTest->setName("exit.cond.mask.test"); + pBranch->setCondition(NewTest); + } + pReplacedInst = pBranch; + break; + } + + case Instruction::PHI: { + Type *vecType = B->GetVectorType(pInst->getType()); + pInst->mutateType(vecType); + pReplacedInst = pInst; + break; + } + + case Instruction::Alloca: { + AllocaInst *pAllocaInst = cast(pInst); + Type *pVecType = B->GetVectorType(pAllocaInst->getAllocatedType()); + Value *pReturn = B->ALLOCA(pVecType, nullptr, pInst->getName()); + pReplacedInst = pReturn; + break; + } + + case Instruction::ShuffleVector: { + auto Src1 = pInst->getOperand(0); + auto Src2 = pInst->getOperand(1); + auto Mask = pInst->getOperand(2); + if (Src1->getType()->getVectorNumElements() == 1 && + Mask->getType()->getVectorNumElements() == 1) { + if (cast(Mask)->isAllOnesValue()) + pReplacedInst = getPacketizeValue(Src2); + else + pReplacedInst = getPacketizeValue(Src1); + } else + report_fatal_error( + "ShuffleVector should've been replaced by Scalarizer."); + break; + } + + case Instruction::IntToPtr: { + IntToPtrInst *pIntToPtrInst = cast(pInst); + Value *pVecSrc = getPacketizeValue(pInst->getOperand(0)); + Type *pVecDestTy = VectorType::get(pIntToPtrInst->getDestTy(), B->mVWidth); + pReplacedInst = B->INT_TO_PTR(pVecSrc, pVecDestTy); + break; + } + + case Instruction::Select: { + Value *pVecCond = getPacketizeValue(pInst->getOperand(0)); + Value *pTrueSrc = getPacketizeValue(pInst->getOperand(1)); + 
Value *pFalseSrc = getPacketizeValue(pInst->getOperand(2)); + + if (!pTrueSrc->getType()->isPointerTy()) { + // simple select packetization + pReplacedInst = B->SELECT(pVecCond, pTrueSrc, pFalseSrc); + } else { + // vector struct input, need to loop over components and build up new + // struct allocation + Value *pAlloca = B->ALLOCA( + B->GetVectorType(pInst->getType()->getPointerElementType())); + uint32_t numElems = + pInst->getType()->getPointerElementType()->getArrayNumElements(); + + for (uint32_t i = 0; i < numElems; ++i) { + Value *pTrueSrcElem = B->LOAD(pTrueSrc, {0, i}); + Value *pFalseSrcElem = B->LOAD(pFalseSrc, {0, i}); + + // mask store true components + Value *pGep = B->GEP(pAlloca, {0, i}); + B->MASKED_STORE(pTrueSrcElem, pGep, 4, pVecCond); + + // store false components to inverted mask + B->MASKED_STORE(pFalseSrcElem, pGep, 4, B->NOT(pVecCond)); + } + pReplacedInst = pAlloca; + } + break; + } + + case Instruction::Ret: { + ReturnInst *pRet = cast(pInst); + if (pRet->getReturnValue() != nullptr) { + Value *pReturn = getPacketizeValue(pRet->getReturnValue()); + ReturnInst *pNewRet = B->RET(pReturn); + pReplacedInst = pNewRet; + } else { + pReplacedInst = pInst; + } + + break; + } + + default: { + // for the rest of the instructions, vectorize the instruction type as + // well as its args + Type *vecType = B->GetVectorType(pInst->getType()); + pInst->mutateType(vecType); + + for (Use &op : pInst->operands()) { + op.set(getPacketizeValue(op.get())); + } + pReplacedInst = pInst; + } + } + + return pReplacedInst; +} + +Value *GenXPacketize::packetizeGenXIntrinsic(Instruction *inst) { + B->IRB()->SetInsertPoint(inst); + + if (auto CI = dyn_cast_or_null(inst)) { + if (Function *Callee = CI->getCalledFunction()) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(Callee); + Value *replacement = nullptr; + // some intrinsics are uniform therefore should not get here + assert(!isUniformIntrinsic(IID)); + switch (IID) { + case GenXIntrinsic::genx_line: + case GenXIntrinsic::genx_pln: + case GenXIntrinsic::genx_dp2: + case GenXIntrinsic::genx_dp3: + case GenXIntrinsic::genx_dp4: + case GenXIntrinsic::genx_ssdp4a: + case GenXIntrinsic::genx_sudp4a: + case GenXIntrinsic::genx_usdp4a: + case GenXIntrinsic::genx_uudp4a: + case GenXIntrinsic::genx_ssdp4a_sat: + case GenXIntrinsic::genx_sudp4a_sat: + case GenXIntrinsic::genx_usdp4a_sat: + case GenXIntrinsic::genx_uudp4a_sat: + case GenXIntrinsic::genx_dph: + case GenXIntrinsic::genx_transpose_ld: + case GenXIntrinsic::genx_oword_ld: + case GenXIntrinsic::genx_oword_ld_unaligned: + case GenXIntrinsic::genx_oword_st: + case GenXIntrinsic::genx_svm_block_ld: + case GenXIntrinsic::genx_svm_block_ld_unaligned: + case GenXIntrinsic::genx_svm_block_st: + case GenXIntrinsic::genx_load: + case GenXIntrinsic::genx_3d_load: + case GenXIntrinsic::genx_3d_sample: + case GenXIntrinsic::genx_avs: + case GenXIntrinsic::genx_sample: + case GenXIntrinsic::genx_sample_unorm: + case GenXIntrinsic::genx_simdcf_any: + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + case GenXIntrinsic::genx_simdcf_predicate: + case GenXIntrinsic::genx_rdpredregion: + case GenXIntrinsic::genx_wrconstregion: + case GenXIntrinsic::genx_wrpredregion: + case GenXIntrinsic::genx_wrpredpredregion: + case GenXIntrinsic::genx_output: + case GenXIntrinsic::genx_va_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_1d_convolve_vertical: + case GenXIntrinsic::genx_va_1pixel_convolve: + case GenXIntrinsic::genx_va_1pixel_convolve_1x1mode: + case 
GenXIntrinsic::genx_va_bool_centroid: + case GenXIntrinsic::genx_va_centroid: + case GenXIntrinsic::genx_va_convolve2d: + case GenXIntrinsic::genx_va_correlation_search: + case GenXIntrinsic::genx_va_dilate: + case GenXIntrinsic::genx_va_erode: + case GenXIntrinsic::genx_va_flood_fill: + case GenXIntrinsic::genx_va_hdc_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_hdc_1d_convolve_vertical: + case GenXIntrinsic::genx_va_hdc_1pixel_convolve: + case GenXIntrinsic::genx_va_hdc_convolve2d: + case GenXIntrinsic::genx_va_hdc_dilate: + case GenXIntrinsic::genx_va_hdc_erode: + case GenXIntrinsic::genx_va_hdc_lbp_correlation: + case GenXIntrinsic::genx_va_hdc_lbp_creation: + case GenXIntrinsic::genx_va_hdc_minmax_filter: + case GenXIntrinsic::genx_va_lbp_correlation: + case GenXIntrinsic::genx_va_lbp_creation: + case GenXIntrinsic::genx_va_minmax: + case GenXIntrinsic::genx_va_minmax_filter: + case GenXIntrinsic::genx_media_ld: + case GenXIntrinsic::genx_media_st: + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_send_noresult: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_raw_sends_noresult: + report_fatal_error("Unsupported genx intrinsic in SIMT mode."); + return nullptr; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_xor: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_fmax: + { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_inc: + case GenXIntrinsic::genx_dword_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Src0, BTI, Src2, Src3}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_fcmpwr: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = 
{RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_gather: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); + assert(isa(NBlk)); + Value *Addr = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Predicate, NBlk, Addr, Src3}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Addr->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_scatter: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); + assert(isa(NBlk)); + Value *Addr = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Predicate, NBlk, Addr, Src3}; + // store, no return type + Type *Tys[] = {Predicate->getType(), Addr->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_gather4_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *ChMask = CI->getOperand(1); + assert(isa(ChMask)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *Addr = getUniformValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Predicate, ChMask, Scale, Addr, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Src4->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_scatter4_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *ChMask = CI->getOperand(1); + assert(isa(ChMask)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *Addr = getUniformValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Predicate, ChMask, Scale, Addr, Src4, Src5}; + // store no return type + Type *Tys[] = {Predicate->getType(), 
Addr->getType(), Src4->getType(), + Src5->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_typed: { + Value *ChMask = CI->getOperand(0); + assert(isa(ChMask)); + Value *Predicate = getPacketizeValue(CI->getOperand(1)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {ChMask, Predicate, BTI, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_scatter4_typed: { + Value *ChMask = CI->getOperand(0); + assert(isa(ChMask)); + Value *Predicate = getPacketizeValue(CI->getOperand(1)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {ChMask, Predicate, BTI, Src3, Src4, Src5, Src6}; + // store no return type + Type *Tys[] = {Predicate->getType(), Src3->getType(), Src6->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_scatter4_scaled: + case GenXIntrinsic::genx_scatter_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); // or channel mask for scatter4 + assert(isa(NBlk)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(3)); + Value *GOff = getUniformValue(CI->getOperand(4)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(5)); + Value *InData = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Predicate, NBlk, Scale, BTI, + GOff, ElemOffsets, InData}; + // no return value for store + Type *Tys[] = {Args[0]->getType(), Args[5]->getType(), + Args[6]->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_scaled: + case GenXIntrinsic::genx_gather_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); // or channel mask for gather4 + assert(isa(NBlk)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(3)); + Value *GOff = getUniformValue(CI->getOperand(4)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(5)); + Value *InData = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Predicate, NBlk, Scale, BTI, + GOff, ElemOffsets, InData}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Args[0]->getType(), Args[5]->getType()}; + auto Decl = 
GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_scaled2: + case GenXIntrinsic::genx_gather_scaled2: { + Value *NBlk = CI->getOperand(0); + assert(isa(NBlk)); + Value *Scale = CI->getOperand(1); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *GOff = getUniformValue(CI->getOperand(3)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {NBlk, Scale, BTI, GOff, ElemOffsets}; + Type *RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, ElemOffsets->getType()}; + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } + case GenXIntrinsic::genx_lane_id: { + assert((CI->getType()->getIntegerBitWidth() == 32) && + "Expected to return 32-bit integer."); + if (B->mVWidth == 8) { + std::initializer_list l = {0, 1, 2, 3, 4, 5, 6, 7}; + replacement = B->C(l); + } else if (B->mVWidth == 16) { + std::initializer_list l = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + replacement = B->C(l); + } else if (B->mVWidth == 32) { + std::initializer_list l = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + replacement = B->C(l); + } else + assert(false); + return replacement; + } break; + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: { + // packetize intrinsic operands + const DebugLoc &DL = CI->getDebugLoc(); + auto OrigV0 = CI->getOperand(0); + CMRegion R(CI); + assert(R.Width == 1); + if (OrigV0->getType()->getVectorNumElements() == 1) { + replacement = getPacketizeValue(OrigV0); + } else { + R.NumElements = B->mVWidth; + if (R.Indirect) { + R.Indirect = getPacketizeValue(R.Indirect); + } + replacement = R.createRdRegion(getPacketizeValue(OrigV0), + CI->getName(), CI, DL); + } + return replacement; + } break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: { + auto NewV0 = CI->getOperand(1); + const DebugLoc &DL = CI->getDebugLoc(); + CMRegion R(CI); + assert(isa(NewV0->getType())); + assert(NewV0->getType()->getVectorNumElements() == 1); + auto NewV1 = getPacketizeValue(NewV0); + R.NumElements = B->mVWidth; + if (R.Indirect) { + R.Indirect = getPacketizeValue(R.Indirect); + } + replacement = + R.createWrRegion(CI->getOperand(0), NewV1, CI->getName(), CI, DL); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_add: + case GenXIntrinsic::genx_untyped_atomic_sub: + case GenXIntrinsic::genx_untyped_atomic_min: + case GenXIntrinsic::genx_untyped_atomic_max: + case GenXIntrinsic::genx_untyped_atomic_xchg: + case GenXIntrinsic::genx_untyped_atomic_and: + case GenXIntrinsic::genx_untyped_atomic_or: + case GenXIntrinsic::genx_untyped_atomic_xor: + case GenXIntrinsic::genx_untyped_atomic_imin: + case GenXIntrinsic::genx_untyped_atomic_imax: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4, Src5}; + auto RetTy = 
B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_inc: + case GenXIntrinsic::genx_untyped_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_xor: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_fmax: + { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_typed_atomic_inc: + case GenXIntrinsic::genx_typed_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, 
Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + case GenXIntrinsic::genx_typed_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Src7 = getPacketizeValue(CI->getOperand(7)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5, Src6, Src7}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src4->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + // default llvm-intrinsic packetizing rule should work for svm atomics + default: + break; + } + } + } + return nullptr; +} + +/// - map old instruction to new in case we revisit the old instruction +Value *GenXPacketize::packetizeInstruction(Instruction *pInst) { + // determine instruction type and call its packetizer + Value *pResult = packetizeGenXIntrinsic(pInst); + if (!pResult) { + if (IsLLVMIntrinsic(pInst)) + pResult = packetizeLLVMIntrinsic(pInst); + else + pResult = packetizeLLVMInstruction(pInst); + } + + if (pResult) { + if (pInst->getName() != "") { + pResult->setName(pInst->getName()); + } + + // Copy any metadata to new instruction + if (pResult != pInst && isa(pResult)) { + cast(pResult)->copyMetadata(*pInst); + } + } + + return pResult; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Replace all uses but avoid any type checking as instructions +/// maybe in a partial bad state. +/// @param pInst - old instruction we're replacing. +/// @param pNewInst - new instruction +void GenXPacketize::replaceAllUsesNoTypeCheck(Value *pInst, Value *pNewInst) { + SmallVector users; + SmallVector opNum; + + for (auto &U : pInst->uses()) { + users.push_back(U.getUser()); + opNum.push_back(U.getOperandNo()); + } + for (uint32_t i = 0; i < users.size(); ++i) { + users[i]->setOperand(opNum[i], pNewInst); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Remove replaced instructions. DCE will not remove calls, etc. +/// So we have to remove these manually. +void GenXPacketize::removeDeadInstructions(Function &F) { + SmallVector unused; + for (auto RMI : ReplaceMap) { + if (RMI.first != RMI.second) { + if (Instruction *UnusedInst = + (Instruction *)dyn_cast(RMI.first)) { + unused.push_back(UnusedInst); + } + } + } + for (auto UnusedInst : unused) { + UnusedInst->replaceAllUsesWith(UndefValue::get(UnusedInst->getType())); + UnusedInst->eraseFromParent(); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief LLVM optimizes certain operations and replaces with general C +/// functions instead +/// of llvm intrinsics (sqrtf() instead of llvm.sqrt() for example). 
We +/// convert these back to known llvm intrinsics before packetization, +/// which are handled natively +/// @param F - function to analyze +void GenXPacketize::fixupLLVMIntrinsics(Function &F) { + std::unordered_set removeSet; + + for (auto &BB : F.getBasicBlockList()) { + for (auto &I : BB.getInstList()) { + if (isa(I)) { + CallInst *pCallInst = cast(&I); + Function *pFunc = pCallInst->getCalledFunction(); + if (pFunc) { + if (pFunc->getName().startswith("sqrt")) { + B->IRB()->SetInsertPoint(&I); + Value *pSqrt = B->VSQRTPS(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pSqrt); + removeSet.insert(pCallInst); + } else if (pFunc->getName().startswith("fabs")) { + B->IRB()->SetInsertPoint(&I); + Value *pFabs = B->FABS(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pFabs); + removeSet.insert(pCallInst); + } else if (pFunc->getName().startswith("exp2")) { + B->IRB()->SetInsertPoint(&I); + Value *pExp2 = B->EXP2(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pExp2); + removeSet.insert(pCallInst); + } else if (pFunc->getName().equals("ldexpf")) { + B->IRB()->SetInsertPoint(&I); + Value *pArg = pCallInst->getOperand(0); + Value *pExp = pCallInst->getOperand(1); + + // replace ldexp with arg * 2^exp = arg * (2 << arg) + Value *pShift = B->SHL(B->C(1), pExp); + pShift = B->UI_TO_FP(pShift, B->mFP32Ty); + Value *pResult = B->FMUL(pArg, pShift); + pCallInst->replaceAllUsesWith(pResult); + removeSet.insert(pCallInst); + } + } + } + } + } + + for (auto *pInst : removeSet) { + pInst->eraseFromParent(); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief find the global ExecMask variable if exists in order to lower +/// CM SIMD control-flow representation after packetization +GlobalVariable *GenXPacketize::findGlobalExecMask() { + // look for the global EMask variable if exists + for (auto &Global : M->getGlobalList()) { + auto Ty = Global.getType()->getElementType(); + if (Ty->isVectorTy() && + Ty->getVectorNumElements() == CMSimdCFLower::MAX_SIMD_CF_WIDTH) { + auto ElemTy = Ty->getVectorElementType(); + if (ElemTy->isIntegerTy() && ElemTy->getIntegerBitWidth() == 1) { + // so far the type is right, then check the use + for (auto EMUI = Global.use_begin(), EMUE = Global.use_end(); + EMUI != EMUE; ++EMUI) { + if (auto LD = dyn_cast(EMUI->getUser())) { + for (auto UI = LD->user_begin(), E = LD->user_end(); UI != E; + ++UI) { + const Value *LocalUse = (*UI); + if (auto CI = dyn_cast_or_null(LocalUse)) { + if (Function *Callee = CI->getCalledFunction()) { + if (GenXIntrinsic::getGenXIntrinsicID(Callee) == + GenXIntrinsic::genx_simdcf_goto) + return &Global; + } + } + } + } + } + } + } + } + return nullptr; +} +////////////////////////////////////////////////////////////////////////// +/// @brief lower CM SIMD control-flow representation after packetization +/// +void GenXPacketize::lowerControlFlowAfter(std::vector &SIMTFuncs) { + auto EMVar = findGlobalExecMask(); + // create one if we cannot find one. + if (!EMVar) { + auto EMTy = VectorType::get(Type::getInt1Ty(M->getContext()), + CMSimdCFLower::MAX_SIMD_CF_WIDTH); + EMVar = new GlobalVariable(*M, EMTy, false /*isConstant*/, + GlobalValue::InternalLinkage, + Constant::getAllOnesValue(EMTy), "EM"); + } + CMSimdCFLower CFL(EMVar); + // Derive an order to process functions such that a function is visited + // after anything that calls it. 
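  // The loop just below walks SIMTFuncs from back to front; that satisfies the
  // requirement above provided the vector records callees before the functions
  // that call them (the fill order is established by the caller that builds
  // SIMTFuncs).
  //
  // Illustrative sketch only -- not part of this pass: a conventional way to
  // derive such an order from scratch is a post-order walk of the call graph,
  // which emits callees first, so visiting it in reverse sees every caller
  // before its callees.
  //
  //   #include "llvm/ADT/PostOrderIterator.h"
  //   #include "llvm/Analysis/CallGraph.h"
  //
  //   std::vector<Function *> Order;           // callees end up first
  //   CallGraph CG(*M);
  //   for (CallGraphNode *N : post_order(&CG))
  //     if (Function *F = N->getFunction())    // skip the external root node
  //       Order.push_back(F);
  //   for (auto I = Order.rbegin(), E = Order.rend(); I != E; ++I)
  //     CFL.processFunction(*I);               // callers before callees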
+ int n = SIMTFuncs.size(); + for (int i = n - 1; i >= 0; --i) + CFL.processFunction(SIMTFuncs[i]); +} + +// foward declare the initializer +void initializeGenXPacketizePass(PassRegistry &); + +} // namespace llvm + +using namespace llvm; + +char GenXPacketize::ID = 0; +INITIALIZE_PASS_BEGIN(GenXPacketize, "GenXPacketize", "GenXPacketize", false, + false) +INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) +INITIALIZE_PASS_END(GenXPacketize, "GenXPacketize", "GenXPacketize", false, + false) + +namespace llvm { +ModulePass *createGenXPacketizePass() { + initializeGenXPacketizePass(*PassRegistry::getPassRegistry()); + return new GenXPacketize(); +} +} // namespace llvm diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp new file mode 100644 index 000000000000..c09ff72d3b1b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp @@ -0,0 +1,209 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +using namespace llvm; + +namespace pktz +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Contructor for Builder. + /// @param pJitMgr - JitManager which contains modules, function passes, etc. 
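  /// @param pModule - LLVM module the packetized code is built into (this is
  ///                  the parameter the definition below actually takes)
  /// @param width   - target SIMD width in lanes, forwarded to SetTargetWidth()
  ///
  /// Rough usage sketch (illustrative only; M and ScalarTy are placeholders):
  ///   PacketBuilder B(M, /*width=*/16);         // one builder per module
  ///   Type *VecTy = B.GetVectorType(ScalarTy);  // SOA-packetized type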
+ PacketBuilder::PacketBuilder(Module *pModule, uint32_t width) + { + mVWidth16 = 16; + mpModule = static_cast(pModule); + + // Built in types: scalar + LLVMContext& Ctx = getContext(); + mpIRBuilder = new IGCLLVM::IRBuilder<>(Ctx); + mVoidTy = Type::getVoidTy(Ctx); + mFP16Ty = Type::getHalfTy(Ctx); + mFP32Ty = Type::getFloatTy(Ctx); + mFP32PtrTy = PointerType::get(mFP32Ty, 0); + mDoubleTy = Type::getDoubleTy(Ctx); + mInt1Ty = Type::getInt1Ty(Ctx); + mInt8Ty = Type::getInt8Ty(Ctx); + mInt16Ty = Type::getInt16Ty(Ctx); + mInt32Ty = Type::getInt32Ty(Ctx); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); + mInt64Ty = Type::getInt64Ty(Ctx); + + mSimd4FP64Ty = VectorType::get(mDoubleTy, 4); + + // Built in types: simd16 + mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16); + mSimd16Int16Ty = VectorType::get(mInt16Ty, mVWidth16); + mSimd16Int32Ty = VectorType::get(mInt32Ty, mVWidth16); + mSimd16Int64Ty = VectorType::get(mInt64Ty, mVWidth16); + mSimd16FP16Ty = VectorType::get(mFP16Ty, mVWidth16); + mSimd16FP32Ty = VectorType::get(mFP32Ty, mVWidth16); + + mSimd32Int8Ty = VectorType::get(mInt8Ty, 32); + + if (sizeof(uint32_t*) == 4) + { + mIntPtrTy = mInt32Ty; + mSimd16IntPtrTy = mSimd16Int32Ty; + } + else + { + assert(sizeof(uint32_t*) == 8); + mIntPtrTy = mInt64Ty; + mSimd16IntPtrTy = mSimd16Int64Ty; + } + // Built in types: target simd + SetTargetWidth(width); + + } + + void PacketBuilder::SetTargetWidth(uint32_t width) + { + mVWidth = width; + + mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + if (sizeof(uint32_t*) == 4) + { + mSimdIntPtrTy = mSimdInt32Ty; + } + else + { + assert(sizeof(uint32_t*) == 8); + mSimdIntPtrTy = mSimdInt64Ty; + } + } + + /// @brief Mark this alloca as temporary to avoid hoisting later on + void PacketBuilder::SetTempAlloca(Value* inst) + { + AllocaInst* pAlloca = dyn_cast(inst); + assert(pAlloca && "Unexpected non-alloca instruction"); + MDNode* N = MDNode::get(getContext(), MDString::get(getContext(), "is_temp_alloca")); + pAlloca->setMetadata("is_temp_alloca", N); + } + + bool PacketBuilder::IsTempAlloca(Value* inst) + { + AllocaInst* pAlloca = dyn_cast(inst); + assert(pAlloca && "Unexpected non-alloca instruction"); + + return (pAlloca->getMetadata("is_temp_alloca") != nullptr); + } + + // Returns true if able to find a call instruction to mark + bool PacketBuilder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) + { + CallInst* pCallInstr = dyn_cast(inst); + if (pCallInstr) + { + MDNode* N = MDNode::get(getContext(), MDString::get(getContext(), mdName)); + pCallInstr->setMetadata(mdName, N); + return true; + } + else + { + // Follow use def chain back up + for (Use& u : inst->operands()) + { + Instruction* srcInst = dyn_cast(u.get()); + if (srcInst) + { + if (SetNamedMetaDataOnCallInstr(srcInst, mdName)) + { + return true; + } + } + } + } + + return false; + } + + bool PacketBuilder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) + { + CallInst* pCallInstr = dyn_cast(inst); + + if (!pCallInstr) + { + return false; + } + + return (pCallInstr->getMetadata(mdName) != nullptr); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Packetizes the 
type. Assumes SOA conversion. + Type* PacketBuilder::GetVectorType(Type* pType) + { + if (pType->isVoidTy()) + return pType; + + if (pType->isVectorTy()) + { + uint32_t vectorSize = pType->getVectorNumElements(); + Type* pElemType = pType->getVectorElementType(); + Type* pVecType = VectorType::get(pElemType, vectorSize*mVWidth); + return pVecType; + } + + // [N x float] should packetize to [N x <8 x float>] + if (pType->isArrayTy()) + { + uint32_t arraySize = pType->getArrayNumElements(); + Type* pArrayType = pType->getArrayElementType(); + Type* pVecArrayType = GetVectorType(pArrayType); + Type* pVecType = ArrayType::get(pVecArrayType, arraySize); + return pVecType; + } + + // {float,int} should packetize to {<8 x float>, <8 x int>} + if (pType->isAggregateType()) + { + uint32_t numElems = pType->getStructNumElements(); + SmallVector vecTypes; + for (uint32_t i = 0; i < numElems; ++i) + { + Type* pElemType = pType->getStructElementType(i); + Type* pVecElemType = GetVectorType(pElemType); + vecTypes.push_back(pVecElemType); + } + Type* pVecType = StructType::get(getContext(), vecTypes); + return pVecType; + } + + // should packetize to <8 x > + Type* vecType = VectorType::get(pType, mVWidth); + return vecType; + } +} // end of namespace pktz diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h new file mode 100644 index 000000000000..5d3586148bb5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h @@ -0,0 +1,340 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#pragma once +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include "llvmWrapper/IR/IRBuilder.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include "llvmWrapper/IR/Module.h" + +#include +#include + +using namespace llvm; + +namespace pktz +{ + struct PacketBuilder + { + public: + PacketBuilder(Module* pModule, uint32_t width = 16); + virtual ~PacketBuilder() + { + if (mpIRBuilder) + delete mpIRBuilder; + } + + IGCLLVM::IRBuilder<>* IRB() { return mpIRBuilder; }; + LLVMContext &getContext() { return mpModule->getContext(); } + + IGCLLVM::Module* mpModule; + IGCLLVM::IRBuilder<>* mpIRBuilder; + + uint32_t mVWidth; // vector width target simd + uint32_t mVWidth16; // vector width simd16 + + // Built in types: scalar + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* mFP32PtrTy; + Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; + + Type* mSimd4FP64Ty; + + // Built in types: target SIMD + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt1Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + + // Built in types: simd16 + + Type* mSimd16FP16Ty; + Type* mSimd16FP32Ty; + Type* mSimd16Int1Ty; + Type* mSimd16Int16Ty; + Type* mSimd16Int32Ty; + Type* mSimd16Int64Ty; + Type* mSimd16IntPtrTy; + + Type* mSimd32Int8Ty; + + void SetTargetWidth(uint32_t width); + void SetTempAlloca(Value* inst); + bool IsTempAlloca(Value* inst); + bool SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + bool HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + Type* GetVectorType(Type* pType); + void SetMetadata(StringRef s, uint32_t val) + { + llvm::NamedMDNode* metaData = mpModule->getOrInsertNamedMetadata(s); + Constant* cval = IRB()->getInt32(val); + llvm::MDNode* mdNode = llvm::MDNode::get(getContext(), + llvm::ConstantAsMetadata::get(cval)); + if (metaData->getNumOperands()) + { + metaData->setOperand(0, mdNode); + } + else + { + metaData->addOperand(mdNode); + } + } + uint32_t GetMetadata(StringRef s) + { + NamedMDNode* metaData = mpModule->getNamedMetadata(s); + if (metaData) + { + MDNode* mdNode = metaData->getOperand(0); + Metadata* val = mdNode->getOperand(0); + return mdconst::dyn_extract(val)->getZExtValue(); + } + else + { + return 0; + } + } +#include "gen_builder.hpp" +#include "gen_builder_intrin.hpp" +#include "gen_builder_meta.hpp" + + Value* VLOG2PS(Value* src); + Value* VPOW24PS(Value* src); + Value* VEXP2PS(Value* src); + + //#include "PacketBuilder_misc.h" + Constant* C(bool i); + Constant* C(char i); + Constant* C(uint8_t i); + Constant* C(int i); + Constant* C(int64_t i); + Constant* C(uint64_t i); + Constant* C(uint16_t i); + Constant* C(uint32_t i); + Constant* C(float i); + + template + Constant* C(const std::initializer_list& constList) + { + std::vector vConsts; + for (auto i : constList) + { + vConsts.push_back(C((Ty)i)); + } + return ConstantVector::get(vConsts); + } + + template + Constant* CA(LLVMContext& ctx, ArrayRef constList) + { + return ConstantDataArray::get(ctx, constList); + } + + template + Constant* CInc(uint32_t base, uint32_t count) + { + std::vector vConsts; + + for 
(uint32_t i = 0; i < count; i++) + { + vConsts.push_back(C((Ty)base)); + base++; + } + return ConstantVector::get(vConsts); + } + + Constant* PRED(bool pred); + + Value* VIMMED1(int i); + Value* VIMMED1_16(int i); + + Value* VIMMED1(uint32_t i); + Value* VIMMED1_16(uint32_t i); + + Value* VIMMED1(float i); + Value* VIMMED1_16(float i); + + Value* VIMMED1(bool i); + Value* VIMMED1_16(bool i); + + Value* VUNDEF(Type* t); + + Value* VUNDEF_F(); + Value* VUNDEF_F_16(); + + Value* VUNDEF_I(); + Value* VUNDEF_I_16(); + + Value* VUNDEF(Type* ty, uint32_t size); + + Value* VUNDEF_IPTR(); + + Value* VBROADCAST(Value* src, const llvm::Twine& name = ""); + Value* VBROADCAST_16(Value* src); + + Value* VRCP(Value* va, const llvm::Twine& name = ""); + Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY); + + uint32_t IMMED(Value* i); + int32_t S_IMMED(Value* i); + + CallInst* CALL(Value* Callee, const std::initializer_list& args, const llvm::Twine& name = ""); + CallInst* CALL(Value* Callee) + { + return CALLA(Callee); + } + CallInst* CALL(Value* Callee, Value* arg); + CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2); + CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3); + + Value* MASK(Value* vmask); + Value* MASK_16(Value* vmask); + + Value* VMASK(Value* mask); + Value* VMASK_16(Value* mask); + + Value* VMOVMSK(Value* mask); + + ////////////////////////////////////////////////////////////////////////// + /// @brief functions that build IR to call x86 intrinsics directly, or + /// emulate them with other instructions if not available on the host + ////////////////////////////////////////////////////////////////////////// + + Value* EXTRACT_16(Value* x, uint32_t imm); + Value* JOIN_16(Value* a, Value* b); + + Value* PSHUFB(Value* a, Value* b); + Value* PMOVSXBD(Value* a); + Value* PMOVSXWD(Value* a); + Value* PMAXSD(Value* a, Value* b); + Value* PMINSD(Value* a, Value* b); + Value* PMAXUD(Value* a, Value* b); + Value* PMINUD(Value* a, Value* b); + Value* VABSPS(Value* a); + Value* FMADDPS(Value* a, Value* b, Value* c); + + Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); + Value* FCLAMP(Value* src, Value* low, Value* high); + Value* FCLAMP(Value* src, float low, float high); + + Value* VPOPCNT(Value* a); + + Value* VEXTRACTI128(Value* a, Constant* imm8); + Value* VINSERTI128(Value* a, Value* b, Constant* imm8); + + Value* CreateEntryAlloca(Function* pFunc, Type* pType); + Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize); + + uint32_t GetTypeSize(Type* pType); + + // #include "PacketBuilder_mem.h" + public: + typedef enum _JIT_MEM_CLIENT + { + MEM_CLIENT_INTERNAL, + GFX_MEM_CLIENT_FETCH, + GFX_MEM_CLIENT_SAMPLER, + GFX_MEM_CLIENT_SHADER, + } JIT_MEM_CLIENT; + + protected: + virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); + void AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage); + + public: + virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, const Twine& Name = ""); + virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = ""); + virtual Value* GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty = nullptr); + virtual Value* GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty = nullptr); + + Value* GEPA(Value* Ptr, ArrayRef IdxList, const Twine& Name = ""); + Value* GEPA(Type* Ty, Value* Ptr, ArrayRef IdxList, const Twine& Name = ""); + + Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList); + 
Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList); + + virtual LoadInst* LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* Ptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* Ptr, + bool isVolatile, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* BasePtr, + const std::initializer_list& offset, + const llvm::Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual CallInst* MASKED_LOAD(Value* Ptr, + unsigned Align, + Value* Mask, + Value* PassThru = nullptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL) + { + return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); + } + + LoadInst* LOADV(Value* BasePtr, const std::initializer_list& offset, const llvm::Twine& name = ""); + StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list& offset); + StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list& offset); + + Value* MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name = ""); + }; +} // end of namespace pktz diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp new file mode 100644 index 000000000000..620a146b1521 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp @@ -0,0 +1,163 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +// need to disable this to use INFINITY and NAN values +#pragma warning(disable : 4756 4056) + +//#include + +namespace pktz +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes log2(a) using either scalar log2 function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VLOG2PS(Value* a) + { + Value* result; + + // fast log2 approximation + // log2(x) = (x.ExpPart - 127) + log(1.xFracPart) + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* b = SUB(AND(ASHR(asInt, 23), 255), VIMMED1(127)); + Value* intermResult = SI_TO_FP(b, mSimdFP32Ty); + + Value* fa = OR(AND(asInt, VIMMED1(0x007FFFFF)), VIMMED1(127 << 23)); + fa = BITCAST(fa, mSimdFP32Ty); + fa = FSUB(fa, VIMMED1(1.0f)); + + // log(x) = (1.4386183024320163f + (-0.640238532500937f + + // 0.20444600983623412f*fx)*fx)*fx; + result = FMUL(fa, VIMMED1(0.20444600983623412f)); + result = FADD(result, VIMMED1(-0.640238532500937f)); + result = FMUL(fa, result); + result = FADD(result, VIMMED1(1.4386183024320163f)); + result = FMUL(result, fa); + result = FADD(result, intermResult); + + // handle bad input + // 0 -> -inf + Value* zeroInput = FCMP_OEQ(a, VIMMED1(0.0f)); + result = SELECT(zeroInput, VIMMED1(-INFINITY), result); + + // -F -> NAN + Value* negInput = FCMP_OLT(a, VIMMED1(0.0f)); + result = SELECT(negInput, VIMMED1(NAN), result); + + // inf -> inf + Value* infInput = FCMP_OEQ(a, VIMMED1(INFINITY)); + result = SELECT(infInput, VIMMED1(INFINITY), result); + + // NAN -> NAN + Value* nanInput = FCMP_UNO(a, a); + result = SELECT(nanInput, VIMMED1(NAN), result); + + result->setName("log2."); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes a^2.4 using either scalar pow function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VPOW24PS(Value* a) + { + Value* result; + // approximation algorithm from + // http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent + // computes a^2.4 with approximately 5% overestimate. 
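    // (Illustrative aside, not taken from the original comments: the code
    //  below leans on the exponent-bit trick.  For a positive float x the
    //  integer view of its bits is roughly
    //      bits(x) ~= 2^23 * (log2(x) + 127),
    //  so scaling the bit pattern by 24/10 and reinterpreting the result as a
    //  float yields approximately x^2.4 once the exponent bias has been
    //  pre-compensated -- which is what correctionFactor folds in.)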
+ // can reduce the error further with a few more terms + + const float expnum = 24; + const float expden = 10; + const float coeffnum = 1.0f; + const float coeffden = 1.0f; + + Value* correctionFactor = + VIMMED1(exp2f(127.f * expden / expnum - 127.f) * + powf(1.f * coeffnum / coeffden, 1.0f * expden / expnum)); + + result = FMUL(a, correctionFactor); + result = SI_TO_FP(BITCAST(result, mSimdInt32Ty), mSimdFP32Ty); + result = FMUL(result, VIMMED1(1.f * expnum / expden)); + result = BITCAST(FP_TO_SI(result, mSimdInt32Ty), mSimdFP32Ty); + + result->setName("pow24."); + return result; + } + +#define EXP_POLY_DEGREE 3 + +#define POLY0(x, c0) VIMMED1(c0) +#define POLY1(x, c0, c1) FADD(FMUL(POLY0(x, c1), x), VIMMED1(c0)) +#define POLY2(x, c0, c1, c2) FADD(FMUL(POLY1(x, c1, c2), x), VIMMED1(c0)) +#define POLY3(x, c0, c1, c2, c3) FADD(FMUL(POLY2(x, c1, c2, c3), x), VIMMED1(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) FADD(FMUL(POLY3(x, c1, c2, c3, c4), x), VIMMED1(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) FADD(FMUL(POLY4(x, c1, c2, c3, c4, c5), x), VIMMED1(c0)) + + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes 2^x using either scalar pow function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VEXP2PS(Value* a) + { + Value* result; + + // fast exp2 taken from here: + // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html + + a = VMINPS(a, VIMMED1(129.0f)); + a = VMAXPS(a, VIMMED1(-126.99999f)); + + Value* ipart = FP_TO_SI(FSUB(a, VIMMED1(0.5f)), mSimdInt32Ty); + Value* fpart = FSUB(a, SI_TO_FP(ipart, mSimdFP32Ty)); + Value* expipart = BITCAST(SHL(ADD(ipart, VIMMED1(127)), 23), mSimdFP32Ty); +#if EXP_POLY_DEGREE == 5 + Value* expfpart = POLY5(fpart, + 9.9999994e-1f, + 6.9315308e-1f, + 2.4015361e-1f, + 5.5826318e-2f, + 8.9893397e-3f, + 1.8775767e-3f); +#elif EXP_POLY_DEGREE == 4 + Value* expfpart = POLY4( + fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); +#elif EXP_POLY_DEGREE == 3 + Value* expfpart = + POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); +#elif EXP_POLY_DEGREE == 2 + Value* expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); +#else +#error +#endif + result = FMUL(expipart, expfpart, "exp2."); + + return result; + } +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp new file mode 100644 index 000000000000..c77bb1ed6d7c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp @@ -0,0 +1,172 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +#include + +namespace pktz +{ + void PacketBuilder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage) + { + assert( + ptr->getType() != mInt64Ty && + "Address appears to be GFX access. Requires translation through BuilderGfxMem."); + } + + Value* PacketBuilder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name) + { + return IRB()->CreateGEP(Ptr, Idx, Name); + } + + Value* PacketBuilder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) + { + return IRB()->CreateGEP(Ty, Ptr, Idx, Name); + } + + Value* PacketBuilder::GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); + } + + Value* PacketBuilder::GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); + } + + Value* PacketBuilder::GEPA(Value* Ptr, ArrayRef IdxList, const Twine& Name) + { + return IRB()->CreateGEP(Ptr, IdxList, Name); + } + + Value* PacketBuilder::GEPA(Type* Ty, Value* Ptr, ArrayRef IdxList, const Twine& Name) + { + return IRB()->CreateGEP(Ty, Ptr, IdxList, Name); + } + + Value* PacketBuilder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return IN_BOUNDS_GEP(ptr, indices); + } + + Value* PacketBuilder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return IN_BOUNDS_GEP(ptr, indices); + } + + LoadInst* PacketBuilder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, Name); + } + + LoadInst* PacketBuilder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, Name); + } + + LoadInst* PacketBuilder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ty, Ptr, Name); + } + + LoadInst* + PacketBuilder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, isVolatile, Name); + } + + LoadInst* PacketBuilder::LOAD(Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name, + Type* Ty, + JIT_MEM_CLIENT usage) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return PacketBuilder::LOAD(GEPA(basePtr, valIndices), name); + } + + LoadInst* PacketBuilder::LOADV(Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return LOAD(GEPA(basePtr, valIndices), name); + } + + 
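    // Usage sketch for the helpers above (illustrative only; pState and the
    // field index are made-up names; ADD comes from the generated builder
    // headers, while C() and these LOAD/STORE overloads are declared in
    // PacketBuilder.h):
    //
    //   Value    *pState = ...;                   // pointer to some struct
    //   LoadInst *pVal   = LOAD(pState, {0, 3});  // GEP {0, 3}, then load
    //   STORE(ADD(pVal, C(1)), pState, {0, 3});   // bump the field, store back
    //
    // MEM_ADD further below wraps exactly this read-modify-write pattern.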
StoreInst* + PacketBuilder::STORE(Value* val, Value* basePtr, const std::initializer_list& indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return STORE(val, GEPA(basePtr, valIndices)); + } + + StoreInst* + PacketBuilder::STOREV(Value* val, Value* basePtr, const std::initializer_list& indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return STORE(val, GEPA(basePtr, valIndices)); + } + + Value* PacketBuilder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) + { + return GEP(base, offset); + } + + Value* PacketBuilder::MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name) + { + Value* i32Value = LOAD(GEP(basePtr, indices), name); + Value* i32Result = ADD(i32Value, i32Incr); + return STORE(i32Result, GEP(basePtr, indices)); + } + +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp new file mode 100644 index 000000000000..131d9e5dd40b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp @@ -0,0 +1,503 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#include "PacketBuilder.h" + +//#include + +namespace pktz +{ + Constant* PacketBuilder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); } + + Constant* PacketBuilder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } + + Constant* PacketBuilder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } + + Constant* PacketBuilder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } + + Constant* PacketBuilder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } + + Constant* PacketBuilder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); } + + Constant* PacketBuilder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } + + Constant* PacketBuilder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } + + Constant* PacketBuilder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); } + + Constant* PacketBuilder::PRED(bool pred) + { + return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 
1 : 0)); + } + + Value* PacketBuilder::VIMMED1(int i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(int i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(uint32_t i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(uint32_t i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(float i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(float i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(bool i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(bool i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth)); } + + Value* PacketBuilder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); } + + Value* PacketBuilder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } + + Value* PacketBuilder::VUNDEF_I_16() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); } + + Value* PacketBuilder::VUNDEF_F() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } + + Value* PacketBuilder::VUNDEF_F_16() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); } + + Value* PacketBuilder::VUNDEF(Type* ty, uint32_t size) + { + return UndefValue::get(VectorType::get(ty, size)); + } + + Value* PacketBuilder::VBROADCAST(Value* src, const llvm::Twine& name) + { + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + if (auto CV = dyn_cast(src)) { + if (CV->getSplatValue()) { + return VECTOR_SPLAT(mVWidth*src->getType()->getVectorNumElements(), + CV->getSplatValue(), name); + } + } + return src; + } + + return VECTOR_SPLAT(mVWidth, src, name); + } + + Value* PacketBuilder::VBROADCAST_16(Value* src) + { + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(mVWidth16, src); + } + + uint32_t PacketBuilder::IMMED(Value* v) + { + assert(isa(v)); + ConstantInt* pValConst = cast(v); + return pValConst->getZExtValue(); + } + + int32_t PacketBuilder::S_IMMED(Value* v) + { + assert(isa(v)); + ConstantInt* pValConst = cast(v); + return pValConst->getSExtValue(); + } + + CallInst* PacketBuilder::CALL(Value* Callee, + const std::initializer_list& argsList, + const llvm::Twine& name) + { + std::vector args; + for (auto arg : argsList) + args.push_back(arg); + return CALLA(Callee, args, name); + } + + CallInst* PacketBuilder::CALL(Value* Callee, Value* arg) + { + std::vector args; + args.push_back(arg); + return CALLA(Callee, args); + } + + CallInst* PacketBuilder::CALL2(Value* Callee, Value* arg1, Value* arg2) + { + std::vector args; + args.push_back(arg1); + args.push_back(arg2); + return CALLA(Callee, args); + } + + CallInst* PacketBuilder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3) + { + std::vector args; + args.push_back(arg1); + args.push_back(arg2); + args.push_back(arg3); + return CALLA(Callee, args); + } + + Value* PacketBuilder::VRCP(Value* va, const llvm::Twine& name) + { + return FDIV(VIMMED1(1.0f), va, name); // 1 / a + } + + Value* PacketBuilder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY) + { + Value* vOut = FMADDPS(vA, vX, vC); + vOut = FMADDPS(vB, 
vY, vOut); + return vOut; + } + + Value* PacketBuilder::EXTRACT_16(Value* x, uint32_t imm) + { + if (imm == 0) + { + return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); + } + else + { + return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); + } + } + + Value* PacketBuilder::JOIN_16(Value* a, Value* b) + { + return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert x86 mask to llvm mask + Value* PacketBuilder::MASK(Value* vmask) + { + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); + } + + Value* PacketBuilder::MASK_16(Value* vmask) + { + Value* src = BITCAST(vmask, mSimd16Int32Ty); + return ICMP_SLT(src, VIMMED1_16(0)); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert llvm mask to x86 mask + Value* PacketBuilder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); } + + Value* PacketBuilder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); } + + /// @brief Convert llvm mask to integer + Value* PacketBuilder::VMOVMSK(Value* mask) + { + assert(mask->getType()->getVectorElementType() == mInt1Ty); + uint32_t numLanes = mask->getType()->getVectorNumElements(); + Value* i32Result; + if (numLanes == 8) + { + i32Result = BITCAST(mask, mInt8Ty); + } + else if (numLanes == 16) + { + i32Result = BITCAST(mask, mInt16Ty); + } + else + { + assert("Unsupported vector width"); + i32Result = BITCAST(mask, mInt8Ty); + } + return Z_EXT(i32Result, mInt32Ty); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation in LLVM IR. If not + /// supported on the underlying platform, emulate it + /// @param a - 256bit SIMD(32x8bit) of 8bit integer values + /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values + /// Byte masks in lower 128 lane of b selects 8 bit values from lower + /// 128bits of a, and vice versa for the upper lanes. If the mask + /// value is negative, '0' is inserted. + Value* PacketBuilder::PSHUFB(Value* a, Value* b) + { + Value* res; + Constant* cB = dyn_cast(b); + assert(cB); + // number of 8 bit elements in b + uint32_t numElms = cast(cB->getType())->getNumElements(); + // output vector + Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); + + // insert an 8 bit value from the high and low lanes of a per loop iteration + numElms /= 2; + for (uint32_t i = 0; i < numElms; i++) + { + ConstantInt* cLow128b = cast(cB->getAggregateElement(i)); + ConstantInt* cHigh128b = cast(cB->getAggregateElement(i + numElms)); + + // extract values from constant mask + char valLow128bLane = (char)(cLow128b->getSExtValue()); + char valHigh128bLane = (char)(cHigh128b->getSExtValue()); + + Value* insertValLow128b; + Value* insertValHigh128b; + + // if the mask value is negative, insert a '0' in the respective output position + // otherwise, lookup the value at mask position (bits 3..0 of the respective mask + // byte) in a and insert in output vector + insertValLow128b = + (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); + insertValHigh128b = (valHigh128bLane < 0) + ? 
C((char)0) + : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); + + vShuf = VINSERT(vShuf, insertValLow128b, i); + vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); + } + res = vShuf; + return res; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only + /// lower 8 values are used. + Value* PacketBuilder::PMOVSXBD(Value* a) + { + // VPMOVSXBD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. + Value* PacketBuilder::PMOVSXWD(Value* a) + { + // VPMOVSXWD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } + + Value* PacketBuilder::PMAXSD(Value* a, Value* b) + { + Value* cmp = ICMP_SGT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMINSD(Value* a, Value* b) + { + Value* cmp = ICMP_SLT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMAXUD(Value* a, Value* b) + { + Value* cmp = ICMP_UGT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMINUD(Value* a, Value* b) + { + Value* cmp = ICMP_ULT(a, b); + return SELECT(cmp, a, b); + } + + // Helper function to create alloca in entry block of function + Value* PacketBuilder::CreateEntryAlloca(Function* pFunc, Type* pType) + { + auto saveIP = IRB()->saveIP(); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); + Value* pAlloca = ALLOCA(pType); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); + return pAlloca; + } + + Value* PacketBuilder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) + { + auto saveIP = IRB()->saveIP(); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); + Value* pAlloca = ALLOCA(pType, pArraySize); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); + return pAlloca; + } + + Value* PacketBuilder::VABSPS(Value* a) + { + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); + return result; + } + + Value* PacketBuilder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) + { + Value* lowCmp = ICMP_SLT(src, low); + Value* ret = SELECT(lowCmp, low, src); + + Value* highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret, name); + + return ret; + } + + Value* PacketBuilder::FCLAMP(Value* src, Value* low, Value* high) + { + Value* lowCmp = FCMP_OLT(src, low); + Value* ret = SELECT(lowCmp, low, src); + + Value* highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; + } + + Value* PacketBuilder::FCLAMP(Value* src, float low, float high) + { + Value* result = VMAXPS(src, VIMMED1(low)); + result = VMINPS(result, VIMMED1(high)); + + return result; + } + + Value* PacketBuilder::FMADDPS(Value* a, Value* b, Value* c) + { + Value* vOut; + + vOut = FADD(FMUL(a, b), c); + return vOut; 
+ } + + ////////////////////////////////////////////////////////////////////////// + /// @brief pop count on vector mask (e.g. <8 x i1>) + Value* PacketBuilder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); } + + ////////////////////////////////////////////////////////////////////////// + /// @brief C functions called by LLVM IR + ////////////////////////////////////////////////////////////////////////// + + Value* PacketBuilder::VEXTRACTI128(Value* a, Constant* imm8) + { + bool flag = !imm8->isZeroValue(); + SmallVector idx; + for (unsigned i = 0; i < mVWidth / 2; i++) + { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); + } + return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); + } + + Value* PacketBuilder::VINSERTI128(Value* a, Value* b, Constant* imm8) + { + bool flag = !imm8->isZeroValue(); + SmallVector idx; + for (unsigned i = 0; i < mVWidth; i++) + { + idx.push_back(C(i)); + } + Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + + SmallVector idx2; + for (unsigned i = 0; i < mVWidth / 2; i++) + { + idx2.push_back(C(flag ? i : i + mVWidth)); + } + for (unsigned i = mVWidth / 2; i < mVWidth; i++) + { + idx2.push_back(C(flag ? i + mVWidth / 2 : i)); + } + return VSHUFFLE(a, inter, ConstantVector::get(idx2)); + } + + uint32_t PacketBuilder::GetTypeSize(Type* pType) + { + if (pType->isStructTy()) + { + uint32_t numElems = pType->getStructNumElements(); + Type* pElemTy = pType->getStructElementType(0); + return numElems * GetTypeSize(pElemTy); + } + + if (pType->isArrayTy()) + { + uint32_t numElems = pType->getArrayNumElements(); + Type* pElemTy = pType->getArrayElementType(); + return numElems * GetTypeSize(pElemTy); + } + + if (pType->isIntegerTy()) + { + uint32_t bitSize = pType->getIntegerBitWidth(); + return bitSize / 8; + } + + if (pType->isFloatTy()) + { + return 4; + } + + if (pType->isHalfTy()) + { + return 2; + } + + if (pType->isDoubleTy()) + { + return 8; + } + + assert(false && "Unimplemented type."); + return 0; + } +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md new file mode 100644 index 000000000000..05580087b002 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md @@ -0,0 +1 @@ +# Packetizer diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp new file mode 100644 index 000000000000..d33b97707eb5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp @@ -0,0 +1,900 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "WIAnalysis.hpp" + +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include +#include +#include +#include + +#include "llvmWrapper/IR/InstrTypes.h" + + +#include +#include +#include + +using namespace llvm; + +static cl::opt PrintWiaCheck("print-wia-check", cl::init(true), + cl::Hidden, + cl::desc("Debug wia-check analysis")); + +namespace pktz { + +WIAnalysis::WIAnalysis() : FunctionPass(ID) { + initializeWIAnalysisPass(*PassRegistry::getPassRegistry()); +} + +const unsigned int WIAnalysis::MinIndexBitwidthToPreserve = 16; + +void WIAnalysis::print(raw_ostream &OS, const Module *) const { + DenseMap BBIDs; + int id = 0; + for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; + ++I, ++id) { + BasicBlock *BB = &*I; + BBIDs[BB] = id; + } + + OS << "WIAnalysis: " << m_func->getName().str() << "\n"; + + OS << "Args: \n"; + for (Function::arg_iterator I = m_func->arg_begin(), E = m_func->arg_end(); + I != E; ++I) { + Value *AVal = &*I; + DenseMap::const_iterator dep_it = + m_deps.find(AVal); + if (dep_it != m_deps.end()) + OS << " " << "STRIDE:" << dep_it->second << " " << *AVal << "\n"; + else + OS << " unknown " << *AVal << "\n"; + } + OS << "\n"; + + for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I) { + BasicBlock *BB = &*I; + OS << "BB:" << BBIDs[BB]; + if (BB->hasName()) + OS << " " << BB->getName(); + OS << " ; preds ="; + bool isFirst = true; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + BasicBlock *pred = *PI; + OS << ((isFirst) ? " " : ", ") << "BB:" << BBIDs[pred] << " "; + if (pred->hasName()) + OS << pred->getName(); + isFirst = false; + } + OS << "\n"; + for (BasicBlock::iterator it = BB->begin(), ie = BB->end(); it != ie; + ++it) { + Instruction *I = &*it; + DenseMap::const_iterator dep_it = + m_deps.find(I); + if (dep_it != m_deps.end()) { + OS << " " << "STRIDE:" << dep_it->second << " " << *I; + } else { + OS << " unknown " << *I; + } + if (I->isTerminator()) { + auto TI = cast(I); + OS << " ["; + for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) { + BasicBlock *succ = TI->getSuccessor(i); + OS << " BB:" << BBIDs[succ]; + } + OS << " ]"; + } + OS << "\n"; + } + OS << "\n"; + } +} + +bool WIAnalysis::runOnFunction(Function &F) { + + if (!F.hasFnAttribute("CMGenxSIMT")) + return false; + m_func = &F; + DT = &getAnalysis().getDomTree(); + PDT = &getAnalysis().getPostDomTree(); + + m_deps.clear(); + m_changed1.clear(); + m_changed2.clear(); + m_pChangedNew = &m_changed1; + m_pChangedOld = &m_changed2; + m_ctrlBranches.clear(); + + initDependency(&F); + + inst_iterator it = inst_begin(F); + inst_iterator e = inst_end(F); + for (; it != e; ++it) { + calculate_dep(&*it); + } + + // Recursively check if WI-dep changes and if so reclaculates + // the WI-dep and marks the users for re-checking. + // This procedure is guranteed to converge since WI-dep can only + // become less unifrom (uniform->consecutive->ptr->stride->random). 
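+ // For example, a value derived from genx_lane_id() starts at stride 1;
+ // multiplying it by a constant 4 gives stride 4, and anything the analysis
+ // cannot track collapses to RANDOM (stride >= 1024). Each value can only
+ // move in that direction, so the worklist below terminates.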
+ updateDeps();
+
+ if (PrintWiaCheck) {
+ print(dbgs());
+ }
+ return false;
+}
+
+void WIAnalysis::updateDeps() {
+ // As long as we have values to update
+ while (!m_pChangedNew->empty()) {
+ // swap between changedSet pointers - recheck the newChanged (now old)
+ std::swap(m_pChangedNew, m_pChangedOld);
+ // clear the newChanged set so it will be filled with the users of
+ // instructions whose WI-dep changed during the current iteration
+ m_pChangedNew->clear();
+
+ // update all changed values
+ std::vector<const Value *>::iterator it = m_pChangedOld->begin();
+ std::vector<const Value *>::iterator e = m_pChangedOld->end();
+ for (; it != e; ++it) {
+ // recalculate the dependency value of each changed value
+ calculate_dep(*it);
+ }
+ }
+}
+
+bool WIAnalysis::isInstructionSimple(const Instruction *inst) {
+ // avoid changing cb load to sampler load, since sampler load
+ // has longer latency.
+ if (isa(inst)) {
+ return false;
+ }
+
+ if (isa(inst) || isa(inst) ||
+ isa(inst) || isa(inst)) {
+ return true;
+ }
+ return false;
+}
+
+void WIAnalysis::initDependency(llvm::Function *pF) {
+ llvm::Function::arg_iterator ai, ae;
+ ai = pF->arg_begin();
+ ae = pF->arg_end();
+
+ // add all kernel function args as uniform
+ for (; ai != ae; ++ai) {
+ incUpdateDepend(ai, WIAnalysis::UNIFORM);
+ }
+}
+
+bool WIAnalysis::validDepend(const llvm::Value *val) {
+ return (m_deps.find(val) != m_deps.end());
+}
+
+WIAnalysis::WIDependancy WIAnalysis::whichDepend(const Value *val) {
+ assert(m_pChangedNew->empty() && "set should be empty before query");
+ assert(val && "Bad value");
+ if (m_deps.find(val) == m_deps.end()) {
+ // We expect all instructions to be in the map. Otherwise take the safe
+ // path and return RANDOM in release builds (assert in debug). For a
+ // non-instruction (argument, constant) return UNIFORM.
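+ // e.g. a kernel argument or a ConstantInt that was never entered into the
+ // map is reported as UNIFORM here, while an unmapped instruction is
+ // treated as RANDOM.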
+ bool isInst = isa(val); + if (isInst) { + return WIAnalysis::RANDOM; + } + return WIAnalysis::UNIFORM; + } + return m_deps[val]; +} + +bool WIAnalysis::stayUniformIfUsedAt(const Value *val, BasicBlock *use_blk) { + const Instruction *inst = dyn_cast(val); + // if it is a function argument, no problem to use it anywhere inside the + // function + if (!inst) { + return true; + } + if (m_deps.find(inst) == m_deps.end()) { + assert(0 && "trouble, don't have a record"); + return true; + } + if (m_deps[inst] != WIAnalysis::UNIFORM) { + return true; + } + const BasicBlock *def_blk = inst->getParent(); + if (m_ctrlBranches.find(def_blk) == m_ctrlBranches.end()) { + return true; + } + if (m_ctrlBranches.find(use_blk) != m_ctrlBranches.end()) { + return false; + } + // every controlling branch of the def block has to be in the set of + // controlling branches for the use-blk + for (SmallPtrSet::iterator + I = m_ctrlBranches[def_blk].begin(), + E = m_ctrlBranches[def_blk].end(); + I != E; ++I) { + if (!m_ctrlBranches[use_blk].count(*I)) { + return false; + } + } + return true; +} + +void WIAnalysis::invalidateDepend(const Value *val) { + if (m_deps.find(val) != m_deps.end()) { + m_deps.erase(val); + } +} + +bool WIAnalysis::isControlFlowUniform(const Function *F) { + assert(F && "Bad Function"); + + /// Place out-masks + for (Function::const_iterator it = F->begin(), e = F->end(); it != e; ++it) { + WIAnalysis::WIDependancy dep = whichDepend(it->getTerminator()); + if (dep != WIAnalysis::UNIFORM) { + // Found a branch which diverges on the input + return false; + } + } + // All branches are uniform + return true; +} + +WIAnalysis::WIDependancy WIAnalysis::getDependency(const Value *val) { + + if (m_deps.find(val) == m_deps.end()) { + // Make sure that constants are not added in the map. + if (!isa(val)) { + return WIAnalysis::UNIFORM; + } + // Don't expect this happens, let's assert in debug build! + assert(false && "Dependence for 'val' should bave been set already!"); + m_deps[val] = WIAnalysis::UNIFORM; + } + return m_deps[val]; +} + +bool WIAnalysis::hasDependency(const Value *val) { + + if (!isa(val) && !isa(val)) { + return true; + } + return (m_deps.count(val) > 0); +} + +void WIAnalysis::calculate_dep(const Value *val) { + assert(val && "Bad value"); + + // Not an instruction, must be a constant or an argument + // Could this vector type be of a constant which + // is not uniform ? + assert(isa(val) && + "Could we reach here with non instruction value?"); + + const Instruction *inst = dyn_cast(val); + assert(inst && "This Value is not an Instruction"); + + bool hasOriginal = hasDependency(inst); + WIDependancy orig; + // We only calculate dependency on unset instructions if all their operands + // were already given dependency. This is good for compile time since these + // instructions will be visited again after the operands dependency is set. + // An exception are phi nodes since they can be the ancestor of themselves in + // the def-use chain. Note that in this case we force the phi to have the + // pre-header value already calculated. + if (!hasOriginal) { + unsigned int unsetOpNum = 0; + for (unsigned i = 0; i < inst->getNumOperands(); ++i) { + if (!hasDependency(inst->getOperand(i))) + unsetOpNum++; + } + if (isa(inst)) { + // We do not calculate PhiNode with all incoming values unset. 
+ // + // This seems right as we don't expect a phi that only depends upon other + // phi's (if it happens, those phis form a cycle dependency) so any phi's + // calculation will eventually be triggered from calculating a non-phi one + // which the phi depends upon. + if (unsetOpNum == inst->getNumOperands()) + return; + } else { + // We do not calculate non-PhiNode instruction that have unset operands + if (unsetOpNum > 0) + return; + } + orig = WIAnalysis::UNIFORM; + } else { + orig = m_deps[inst]; + // if inst is already marked random, it cannot get better + if (orig == WIAnalysis::RANDOM) { + return; + } + } + + WIDependancy dep = orig; + + // LLVM does not have compile time polymorphisms + // TODO: to make things faster we may want to sort the list below according + // to the order of their probability of appearance. + if (const BinaryOperator *BI = dyn_cast(inst)) + dep = calculate_dep(BI); + else if (const CallInst *CI = dyn_cast(inst)) + dep = calculate_dep(CI); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const GetElementPtrInst *GEP = dyn_cast(inst)) + dep = calculate_dep(GEP); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const PHINode *Phi = dyn_cast(inst)) + dep = calculate_dep(Phi); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = RANDOM; // calculate_dep_simple(inst); + else if (inst->isTerminator()) + dep = calculate_dep(inst); + else if (const SelectInst *SI = dyn_cast(inst)) + dep = calculate_dep(SI); + else if (const AllocaInst *AI = dyn_cast(inst)) + dep = calculate_dep(AI); + else if (const CastInst *CI = dyn_cast(inst)) + dep = calculate_dep(CI); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const LoadInst *LI = dyn_cast(inst)) + dep = calculate_dep(LI); + else if (const VAArgInst *VAI = dyn_cast(inst)) + dep = calculate_dep(VAI); + + // If the value was changed in this calculation + if (!hasOriginal || dep != orig) { + // Save the new value of this instruction + updateDepMap(inst, dep); + // divergent branch, trigger updates due to control-dependence + if (inst->isTerminator() && dep != WIAnalysis::UNIFORM) { + update_cf_dep(inst); + } + } +} + +void WIAnalysis::update_cf_dep(const Instruction *inst) { + BasicBlock *blk = const_cast(inst->getParent()); + BasicBlock *ipd = PDT->getNode(blk)->getIDom()->getBlock(); + // a branch can have NULL immediate post-dominator when a function + // has multiple exits in llvm-ir + // compute influence region and the partial-joins + assert(inst->isTerminator() && "Expected terminator inst"); + BranchInfo br_info(cast(inst), ipd); + // debug: dump influence region and partial-joins + // br_info.print(ods()); + + // check dep-type for every phi in the full join + if (ipd) { + updatePHIDepAtJoin(ipd, &br_info); + } + // check dep-type for every phi in the partial-joins + for (SmallPtrSet::iterator + join_it = br_info.partial_joins.begin(), + join_e = br_info.partial_joins.end(); + join_it != join_e; ++join_it) { + updatePHIDepAtJoin(*join_it, &br_info); + } + + // walk through all the instructions in the influence-region + // update the dep-type based upon its uses + DenseSet::iterator blk_it = br_info.influence_region.begin(); + DenseSet::iterator blk_e = br_info.influence_region.end(); + for (; blk_it != blk_e; ++blk_it) { + BasicBlock *def_blk = *blk_it; + // add the branch into the controlling-branch set of 
the block + // if the block is in the influence-region, and not a partial join + bool is_join = (br_info.partial_joins.count(def_blk) > 0); + if (!is_join) { + m_ctrlBranches[def_blk].insert(inst); + } + // An insight that can speed up the search process is that all the in-region + // values that are used outside must dominate TI. Therefore, instead of + // searching every basic blocks in the influence region, we only search the + // dominators of the current branch + if (def_blk != blk && + !DT->dominates(DT->getNode(def_blk), DT->getNode(blk))) { + continue; + } + for (BasicBlock::iterator I = def_blk->begin(), E = def_blk->end(); I != E; + ++I) { + Instruction *defi = &(*I); + if (hasDependency(defi) && getDependency(defi) == WIAnalysis::RANDOM) { + continue; + } + // look at the uses + Value::use_iterator use_it = defi->use_begin(); + Value::use_iterator use_e = defi->use_end(); + for (; use_it != use_e; ++use_it) { + Instruction *user = dyn_cast((*use_it).getUser()); + assert(user); + BasicBlock *user_blk = user->getParent(); + PHINode *phi = dyn_cast(user); + if (phi) { + // another place we assume all critical edges have been split and + // phi-move will be placed on the blocks created on those + user_blk = phi->getIncomingBlock(*use_it); + } + if (user_blk == def_blk) { + // local def-use, not related to control-dependence + continue; // skip + } + if (user_blk == br_info.full_join || + br_info.partial_joins.count(user_blk) || + !br_info.influence_region.count(user_blk)) { + updateDepMap(defi, WIAnalysis::RANDOM); + // break out of the use loop + // since def is changed to RANDOM, all uses will be changed later + break; + } + } // end of usei loop + } // end of defi loop within a block + } // end of influence-region block loop +} + +void WIAnalysis::updatePHIDepAtJoin(BasicBlock *blk, BranchInfo *brInfo) { + for (BasicBlock::iterator I = blk->begin(), E = blk->end(); I != E; ++I) { + Instruction *defi = &(*I); + PHINode *phi = dyn_cast(defi); + if (!phi) { + break; + } + if (hasDependency(phi) && getDependency(phi) == WIAnalysis::RANDOM) { + continue; + } + Value *trickySrc = nullptr; + for (unsigned predIdx = 0; predIdx < phi->getNumOperands(); ++predIdx) { + Value *srcVal = phi->getOperand(predIdx); + Instruction *defi = dyn_cast(srcVal); + if (defi && brInfo->influence_region.count(defi->getParent())) { + updateDepMap(phi, WIAnalysis::RANDOM); + break; + } else { + // if the src is an immed, or an argument, or defined outside, + // think about the phi-move that can be placed in the incoming block. + // this phi should be random if we have two different src-values like + // that. this is one place where we assume all critical edges have been + // split + BasicBlock *predBlk = phi->getIncomingBlock(predIdx); + if (brInfo->influence_region.count(predBlk)) { + if (!trickySrc) { + trickySrc = srcVal; + } else if (trickySrc != srcVal) { + updateDepMap(phi, WIAnalysis::RANDOM); + break; + } + } + } + } + } +} + +void WIAnalysis::updateDepMap(const Instruction *inst, + WIAnalysis::WIDependancy dep) { + // Save the new value of this instruction + m_deps[inst] = dep; + // Register for update all of the dependent values of this updated + // instruction. 
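+ // Each user is appended to m_pChangedNew; the next updateDeps() iteration
+ // re-visits it and recomputes its dependency via calculate_dep().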
+ Value::const_user_iterator it = inst->user_begin();
+ Value::const_user_iterator e = inst->user_end();
+ for (; it != e; ++it) {
+ m_pChangedNew->push_back(*it);
+ }
+}
+
+WIAnalysis::WIDependancy
+WIAnalysis::calculate_dep_simple(const Instruction *I) {
+ // simply check that all operands are uniform; if so return UNIFORM,
+ // else RANDOM
+ const unsigned nOps = I->getNumOperands();
+ for (unsigned i = 0; i < nOps; ++i) {
+ const Value *op = I->getOperand(i);
+ WIAnalysis::WIDependancy dep = getDependency(op);
+ if (dep != WIAnalysis::UNIFORM) {
+ return WIAnalysis::RANDOM;
+ }
+ }
+ return WIAnalysis::UNIFORM;
+}
+
+WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const LoadInst *inst) {
+ return calculate_dep_simple(inst);
+}
+
+WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const BinaryOperator *inst) {
+ // Calculate the dependency type for each of the operands
+ Value *op0 = inst->getOperand(0);
+ Value *op1 = inst->getOperand(1);
+
+ WIAnalysis::WIDependancy dep0 = getDependency(op0);
+ WIAnalysis::WIDependancy dep1 = getDependency(op1);
+
+ // For any binary operation, uniform operands give a uniform result
+ if (WIAnalysis::UNIFORM == dep0 && WIAnalysis::UNIFORM == dep1) {
+ return WIAnalysis::UNIFORM;
+ }
+
+ // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
+ // The pattern (and (X, C)), where C preserves the lower k bits of the value,
+ // is often used to truncate 64-bit numbers. We assume that the index
+ // properties are not hurt by this.
+ if (inst->getOpcode() == Instruction::And) {
+ ConstantInt *C0 = dyn_cast<ConstantInt>(inst->getOperand(0));
+ ConstantInt *C1 = dyn_cast<ConstantInt>(inst->getOperand(1));
+ // Use either of the constants. Instcombine places constants on Op1
+ // so try Op1 first.
+ if (C1 || C0) {
+ ConstantInt *C = C1 ? C1 : C0;
+ WIAnalysis::WIDependancy dep = C1 ? dep0 : dep1;
+ // Cannot look at bit pattern of huge integers.
+ if (C->getBitWidth() < 65) {
+ uint64_t val = C->getZExtValue();
+ uint64_t ptr_mask = (1 << MinIndexBitwidthToPreserve) - 1;
+ // Zero all bits above the lower k bits that we are interested in
+ val &= (ptr_mask);
+ // Make sure that all of the remaining bits are active
+ if (val == ptr_mask) {
+ return dep;
+ }
+ }
+ }
+ }
+
+ // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
+ // The pattern (ashr (shl X, C), C) is used to truncate 64-bit numbers.
+ // The shift amount C must leave at least MinIndexBitwidthToPreserve bits
+ // of the original number.
+ if (inst->getOpcode() == Instruction::AShr) {
+ BinaryOperator *SHL = dyn_cast<BinaryOperator>(inst->getOperand(0));
+ // We also allow an add of a uniform value between the ashr and shl
+ // instructions, since instcombine creates this pattern when adding a
+ // constant. The shl forces all low bits to be zero, so there can be no
+ // carry to the high bits due to the addition. Addition with a uniform
+ // value preserves the WI-dep.
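+ // e.g. for %s = shl i64 %x, 32 followed by %t = ashr i64 %s, 32, the pair
+ // keeps the low 32 bits of %x; since 64 - 32 >= MinIndexBitwidthToPreserve,
+ // the dependency of %x is returned unchanged for %t.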
+ if (SHL && SHL->getOpcode() == Instruction::Add) { + Value *addedVal = SHL->getOperand(1); + if (getDependency(addedVal) == WIAnalysis::UNIFORM) { + SHL = dyn_cast(SHL->getOperand(0)); + } + } + + if (SHL && SHL->getOpcode() == Instruction::Shl) { + ConstantInt *c_ashr = dyn_cast(inst->getOperand(1)); + ConstantInt *c_shl = dyn_cast(SHL->getOperand(1)); + const IntegerType *AshrTy = cast(inst->getType()); + if (c_ashr && c_shl && c_ashr->getZExtValue() == c_shl->getZExtValue()) { + // If wordWidth - shift_width >= 32 bits + if ((AshrTy->getBitWidth() - c_shl->getZExtValue()) >= + MinIndexBitwidthToPreserve) { + // return the dep of the original X + return getDependency(SHL->getOperand(0)); + } + } + } + } + + if (dep0 == WIAnalysis::RANDOM || dep1 == WIAnalysis::RANDOM) { + return WIAnalysis::RANDOM; + } + // stride computation + switch (inst->getOpcode()) { + // Addition simply adds the stride value, except for ptr_consecutive + // which is promoted to strided. + // Another exception is when we subtract the tid: 1 - X which turns the + // tid order to random. + case Instruction::Add: { + int stride = dep0 + dep1; + return clampDepend(stride); + } + case Instruction::Sub: { + int stride = dep0 - dep1; + return clampDepend(stride); + } + case Instruction::Mul: + if (const ConstantInt* ConstOpnd = dyn_cast(op0)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(c*dep1); + } + else if (const ConstantInt* ConstOpnd = dyn_cast(op1)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(c*dep0); + } + break; + case Instruction::Shl: + if (const ConstantInt* ConstOpnd = dyn_cast(op1)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(dep0<getCalledFunction()) { + switch (GenXIntrinsic::getGenXIntrinsicID(Callee)) { + case GenXIntrinsic::genx_lane_id: + return (WIAnalysis::WIDependancy)1; + default: + break; + } + } + + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy +WIAnalysis::calculate_dep(const GetElementPtrInst *inst) { + // running over the all indices argumets except for the last + // here we assume the pointer is the first operand + unsigned num = inst->getNumIndices(); + for (unsigned i = 1; i < num; ++i) { + const Value *op = inst->getOperand(i); + WIAnalysis::WIDependancy dep = getDependency(op); + if (dep != WIAnalysis::UNIFORM) { + return WIAnalysis::RANDOM; + } + } + const Value *opPtr = inst->getOperand(0); + WIAnalysis::WIDependancy ptrDep = getDependency(opPtr); + + const Value *lastInd = inst->getOperand(num); + WIAnalysis::WIDependancy lastIndDep = getDependency(lastInd); + // \todo + return clampDepend((int)ptrDep + (int)lastIndDep); +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const PHINode *inst) { + unsigned num = inst->getNumIncomingValues(); + bool foundFirst = 0; + WIDependancy totalDep; + + for (unsigned i = 0; i < num; ++i) { + Value *op = inst->getIncomingValue(i); + if (hasDependency(op)) { + if (!foundFirst) { + totalDep = getDependency(op); + } else if (totalDep != getDependency(op)) { + totalDep = WIAnalysis::RANDOM; + } + foundFirst = 1; + } + } + + assert(foundFirst && + "We should not reach here with All incoming values are unset"); + + return totalDep; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const Instruction *inst) { + // Instruction has no return value + // Just need to know if this inst is uniform or not + // because we may want to avoid predication if the control flows + // in the function are uniform... 
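+ // Only the terminator's own condition matters here; how a divergent branch
+ // affects the values inside its influence region is handled separately in
+ // update_cf_dep().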
+ switch (inst->getOpcode()) { + case Instruction::Br: { + const BranchInst *brInst = cast(inst); + if (brInst->isConditional()) { + // Conditional branch is uniform, if its condition is uniform + Value *op = brInst->getCondition(); + WIAnalysis::WIDependancy dep = getDependency(op); + if (WIAnalysis::UNIFORM == dep) { + return WIAnalysis::UNIFORM; + } + return WIAnalysis::RANDOM; + } + // Unconditional branch is non TID-dependent + return WIAnalysis::UNIFORM; + } + // Return instructions are unconditional + case Instruction::Ret: + return WIAnalysis::UNIFORM; + case Instruction::Unreachable: + return WIAnalysis::UNIFORM; + case Instruction::IndirectBr: + return WIAnalysis::RANDOM; + // TODO: Define the dependency requirements of indirectBr + case Instruction::Switch: + return WIAnalysis::RANDOM; + // TODO: Should this depend only on the condition, like branch? + default: + return WIAnalysis::RANDOM; + } +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const SelectInst *inst) { + Value *op0 = inst->getOperand(0); // mask + WIAnalysis::WIDependancy dep0 = getDependency(op0); + if (WIAnalysis::UNIFORM == dep0) { + Value *op1 = inst->getOperand(1); + Value *op2 = inst->getOperand(2); + WIAnalysis::WIDependancy dep1 = getDependency(op1); + WIAnalysis::WIDependancy dep2 = getDependency(op2); + if (dep1 == dep2) + return dep1; + } + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const AllocaInst *inst) { + // \todo + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const CastInst *inst) { + Value *op0 = inst->getOperand(0); + WIAnalysis::WIDependancy dep0 = getDependency(op0); + + // independent remains independent + if (WIAnalysis::UNIFORM == dep0) + return dep0; + + switch (inst->getOpcode()) { + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::AddrSpaceCast: + case Instruction::UIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::SIToFP: + return dep0; + case Instruction::BitCast: + case Instruction::ZExt: + return WIAnalysis::RANDOM; + // FIXME:: assumes that the value does not cross the +/- border - risky !!!! 
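+ // e.g. a trunc from i64 to i32 keeps at least MinIndexBitwidthToPreserve
+ // (16) bits of the index, so the operand's dependency survives; a trunc
+ // down to i8 does not, and becomes RANDOM.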
+ case Instruction::Trunc: { + const Type *destType = inst->getDestTy(); + const IntegerType *intType = dyn_cast(destType); + if (intType && (intType->getBitWidth() >= MinIndexBitwidthToPreserve)) { + return dep0; + } + return WIAnalysis::RANDOM; + } + default: + assert(false && "no such opcode"); + // never get here + return WIAnalysis::RANDOM; + } +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const VAArgInst *inst) { + assert(false && "Are we supporting this ??"); + return WIAnalysis::RANDOM; +} + +BranchInfo::BranchInfo(const IGCLLVM::TerminatorInst *inst, const BasicBlock *ipd) + : cbr(inst), full_join(ipd) { + assert(cbr == inst->getParent()->getTerminator() && "block terminator mismatch"); + assert(cbr->getNumSuccessors() == 2 && "only for cbr with two successors"); + + std::set f_set, t_set; + std::stack work_set; + if (cbr->getSuccessor(0) != full_join) { + work_set.push(cbr->getSuccessor(0)); + while (!work_set.empty()) { + BasicBlock *cur_blk = work_set.top(); + work_set.pop(); + f_set.insert(cur_blk); + influence_region.insert(cur_blk); + for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); + SI != E; ++SI) { + BasicBlock *succ_blk = (*SI); + if (succ_blk != full_join && !f_set.count(succ_blk)) { + work_set.push(succ_blk); + } + } + } + } + if (cbr->getSuccessor(1) != full_join) { + work_set.push(cbr->getSuccessor(1)); + while (!work_set.empty()) { + BasicBlock *cur_blk = work_set.top(); + work_set.pop(); + t_set.insert(cur_blk); + influence_region.insert(cur_blk); + if (f_set.count(cur_blk)) { + partial_joins.insert(cur_blk); + } + for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); + SI != E; ++SI) { + BasicBlock *succ_blk = (*SI); + if (succ_blk != full_join && !t_set.count(succ_blk)) { + work_set.push(succ_blk); + } + } + } + } +} + +void BranchInfo::print(raw_ostream &OS) const { + OS << "\nCBR: " << *cbr; + OS << "\nIPD: "; + if (full_join) { + full_join->print(OS); + } + OS << "\nPartial Joins:"; + SmallPtrSet::iterator join_it = partial_joins.begin(); + SmallPtrSet::iterator join_e = partial_joins.end(); + for (; join_it != join_e; ++join_it) { + BasicBlock *cur_blk = *join_it; + OS << "\n "; + cur_blk->print(OS); + } + OS << "\nInfluence Region:"; + DenseSet::const_iterator blk_it = influence_region.begin(); + DenseSet::const_iterator blk_e = influence_region.end(); + for (; blk_it != blk_e; ++blk_it) { + BasicBlock *cur_blk = *blk_it; + OS << "\n "; + cur_blk->print(OS); + } + OS << "\n"; +} + +char WIAnalysis::ID = 0; // LLVM uses address of ID as the actual ID. 
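+
+// Illustrative sketch only: a client pass would typically consume this
+// analysis as below ("MyPacketizePass" is a hypothetical name, not part of
+// this patch). It requests WIAnalysis in getAnalysisUsage() and then queries
+// whichDepend() per value once the analysis has run:
+//
+//   void MyPacketizePass::getAnalysisUsage(AnalysisUsage &AU) const {
+//     AU.addRequired<WIAnalysis>();
+//   }
+//
+//   bool MyPacketizePass::runOnFunction(Function &F) {
+//     WIAnalysis &WIA = getAnalysis<WIAnalysis>();
+//     for (BasicBlock &BB : F)
+//       for (Instruction &I : BB)
+//         if (WIA.whichDepend(&I) == WIAnalysis::UNIFORM)
+//           ; // value is identical across SIMD lanes; may stay scalar
+//     return false;
+//   }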
+ +FunctionPass *createWIAnalysisPass() { return new WIAnalysis(); } + +} // end of namespace pktz + +using namespace pktz; + +#define PASS_FLAG "wi-analysis" +#define PASS_DESCRIPTION "WIAnalysis provides work item dependency info" +#define PASS_CFG_ONLY true +#define PASS_ANALYSIS true +INITIALIZE_PASS_BEGIN(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, + PASS_ANALYSIS) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, + PASS_ANALYSIS) diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp new file mode 100644 index 000000000000..4d0f7fe51879 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp @@ -0,0 +1,265 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvmWrapper/IR/InstrTypes.h" + +#include + +namespace llvm { +// foward declare the initializer +void initializeWIAnalysisPass(PassRegistry &); +} // namespace llvm + +namespace pktz { +/// @Brief, given a conditional branch and its immediate post dominator, +/// find its influence-region and partial joins within the influence region +class BranchInfo { +public: + BranchInfo(const IGCLLVM::TerminatorInst *inst, const llvm::BasicBlock *ipd); + + void print(llvm::raw_ostream &OS) const; + void dump() const { print(llvm::dbgs()); } + + const IGCLLVM::TerminatorInst *cbr; + const llvm::BasicBlock *full_join; + llvm::DenseSet influence_region; + llvm::SmallPtrSet partial_joins; + llvm::BasicBlock *fork_blk; +}; + +/// @brief Work Item Analysis class used to provide information on +/// individual instructions. The analysis class detects values which +/// depend in work-item and describe their dependency. +/// The algorithm used is recursive and new instructions are updated +/// according to their operands (which are already calculated). 
+/// @Author: Nadav Rotem, who wrote the original code for OCL vectorizer +/// +/// @Author: Gang Chen, adopt it for IGC, +/// - extend it to handle the divergent SIMD control-flow +/// - support GFX-specific intrinsic +class WIAnalysis : public llvm::FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + WIAnalysis(); + + ~WIAnalysis() {} + + /// @brief LLVM llvm::Function pass entry + /// @param F llvm::Function to transform + /// @return True if changed + virtual bool runOnFunction(llvm::Function &F); + + /// @brief Update dependency relations between all values + void updateDeps(); + + /// @brief backward update dependency based upon use + void backwardUpdate(); + + /// @brief initialize value dependence + void initDependency(llvm::Function *pF); + + /// @brief describes the type of dependency on the work item + enum WIDependancy { + UNIFORM = 0, /// All elements in vector are constant + // stride-value between 1 and 1023 + RANDOM = 1024, /// if stride >= 1024, treat as random + }; + + /// The WIAnalysis follows pointer arithmetic + /// and Index arithmetic when calculating dependency + /// properties. If a part of the index is lost due to + /// a transformation, it is acceptable. + /// This constant decides how many bits need to be + /// preserved before we give up on the analysis. + static const unsigned int MinIndexBitwidthToPreserve; + + /// @brief Returns true if the analysis has a dependency + // for the instruction, false otherwise + /// @param val llvm::Value to test + /// @return Validity of dependency + bool validDepend(const llvm::Value *val); + + /// @brief Returns the type of dependency the instruction has on + /// the work-item + /// @param val llvm::Value to test + /// @return Dependency kind + WIDependancy whichDepend(const llvm::Value *val); + + /// @brief Inform analysis that instruction was invalidated + /// as pointer may later be reused + /// @param val llvm::Value to invalidate + void invalidateDepend(const llvm::Value *val); + + /// incremental update of the dep-map on individual value + /// without propagation. Exposed for later pass. + void incUpdateDepend(const llvm::Value *val, WIDependancy dep) { + m_deps[val] = dep; + } + + /// check if a value stay uniform when we add a use in the given block + /// If the value is not uniform to begin with, query returns true. + bool stayUniformIfUsedAt(const llvm::Value *val, llvm::BasicBlock *blk); + + /// check if a value is defined inside divergent control-flow + bool insideDivergentCF(const llvm::Value *val) { + return ( + llvm::isa(val) && + m_ctrlBranches.find(llvm::cast(val)->getParent()) != + m_ctrlBranches.end()); + } + + /// @brief Checks if all of the control flow in the analyzed function is + /// uniform. + /// @param F function to check + /// @return True if masks are needed + bool isControlFlowUniform(const llvm::Function *F); + + virtual void releaseMemory() { + m_deps.clear(); + m_changed1.clear(); + m_changed2.clear(); + m_ctrlBranches.clear(); + m_backwardList.clear(); + } + + /// print - print m_deps in human readable form + virtual void print(llvm::raw_ostream &OS, const llvm::Module * = 0) const; + void dump() const { print(llvm::dbgs()); } + +private: + /*! \name Dependency Calculation Functions + * \{ */ + /// @brief Calculate the dependency type for the instruction + /// @param inst Instruction to inspect + /// @return Type of dependency. 
+ void calculate_dep(const llvm::Value *val); + WIDependancy calculate_dep(const llvm::BinaryOperator *inst); + WIDependancy calculate_dep(const llvm::CallInst *inst); + WIDependancy calculate_dep(const llvm::GetElementPtrInst *inst); + WIDependancy calculate_dep(const llvm::PHINode *inst); + WIDependancy calculate_dep(const llvm::Instruction *inst); + WIDependancy calculate_dep(const llvm::SelectInst *inst); + WIDependancy calculate_dep(const llvm::AllocaInst *inst); + WIDependancy calculate_dep(const llvm::CastInst *inst); + WIDependancy calculate_dep(const llvm::VAArgInst *inst); + WIDependancy calculate_dep(const llvm::LoadInst *inst); + /*! \} */ + + WIDependancy clampDepend(int stride) { + if (stride < 0 || stride >= RANDOM) + return RANDOM; + return (WIDependancy)stride; + } + /// @brief do the trivial checking WI-dep + /// @param I instruction to check + /// @return Dependency type. Returns Uniform if all operands are + /// Uniform, Random otherwise + WIDependancy calculate_dep_simple(const llvm::Instruction *I); + + /// @brief update the WI-dep from a divergent branch, + /// affected instructions are added to m_pChangedNew + /// @param the divergent branch + void update_cf_dep(const llvm::Instruction *TI); + + /// @check phi divergence at a join-blk due to a divergent branch + void updatePHIDepAtJoin(llvm::BasicBlock *blk, BranchInfo *brInfo); + + void updateDepMap(const llvm::Instruction *inst, + WIAnalysis::WIDependancy dep); + + /// @brief Provide known dependency type for requested value + /// @param val llvm::Value to examine + /// @return Dependency type. Returns Uniform for unknown type + WIDependancy getDependency(const llvm::Value *val); + + /// @brief return true if there is calculated dependency type for requested + /// value + /// @param val llvm::Value to examine + /// @return true if value has dependency type, false otherwise. + bool hasDependency(const llvm::Value *val); + + /// @brief return true if all uses of this value are marked UNIFORM + bool allUsesUniform(const llvm::Value *val); + + /// @brief return true is the instruction is simple and making it random is + /// cheap + bool isInstructionSimple(const llvm::Instruction *inst); + + /// @brief LLVM Interface + /// @param AU Analysis + /// WIAnalysis requires dominator and post dominator analysis + /// WIAnalysis also requires BreakCriticalEdge because it assumes that + /// potential phi-moves will be placed at those blocks + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const { + // Analysis pass preserve all + AU.setPreservesAll(); + + AU.addRequired(); + AU.addRequired(); + } + +private: + /// Stores an updated list of all dependencies + llvm::DenseMap m_deps; + /// for each block, store the list of diverging branches that affect it + llvm::DenseMap> + m_ctrlBranches; + + /// Iteratively one set holds the changed from the previous iteration and + /// the other holds the new changed values from the current iteration. 
+ std::vector m_changed1; + std::vector m_changed2; + /// ptr to m_changed1, m_changed2 + std::vector *m_pChangedOld; + std::vector *m_pChangedNew; + + std::vector m_backwardList; + + llvm::Function *m_func = nullptr; + llvm::DominatorTree *DT = nullptr; + llvm::PostDominatorTree *PDT = nullptr; +}; + +} // end of namespace pktz + diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp new file mode 100644 index 000000000000..67d310fdf826 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp @@ -0,0 +1,1035 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . 
+// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated Builder IR Wrappers +//============================================================================ +GlobalVariable* GLOBAL_STRING(StringRef Str, const Twine &Name = "", unsigned AddressSpace = 0) +{ + return IRB()->CreateGlobalString(Str, Name, AddressSpace); +} + +CallInst* MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMCOPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemCpy(Dst, Align, Src, Align, Size, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* MEMCOPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemCpy(Dst, Align, Src, Align, Size, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* ELEMENT_UNORDERED_ATOMIC_MEM_CPY(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, uint32_t ElementSize, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateElementUnorderedAtomicMemCpy(Dst, DstAlign, Src, SrcAlign, Size, ElementSize, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* ELEMENT_UNORDERED_ATOMIC_MEM_CPY(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size, uint32_t ElementSize, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateElementUnorderedAtomicMemCpy(Dst, DstAlign, Src, SrcAlign, Size, ElementSize, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemMove(Dst, Align, Src, Align, Size, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemMove(Dst, Align, Src, Align, Size, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* FADD_REDUCE(Value *Acc, Value *Src) +{ + return IRB()->CreateFAddReduce(Acc, Src); +} + +CallInst* FMUL_REDUCE(Value *Acc, Value *Src) +{ + return IRB()->CreateFMulReduce(Acc, Src); +} + +CallInst* ADD_REDUCE(Value *Src) +{ + return IRB()->CreateAddReduce(Src); +} + +CallInst* MUL_REDUCE(Value *Src) +{ + return 
IRB()->CreateMulReduce(Src); +} + +CallInst* AND_REDUCE(Value *Src) +{ + return IRB()->CreateAndReduce(Src); +} + +CallInst* OR_REDUCE(Value *Src) +{ + return IRB()->CreateOrReduce(Src); +} + +CallInst* XOR_REDUCE(Value *Src) +{ + return IRB()->CreateXorReduce(Src); +} + +CallInst* INT_MAX_REDUCE(Value *Src, bool IsSigned = false) +{ + return IRB()->CreateIntMaxReduce(Src, IsSigned); +} + +CallInst* INT_MIN_REDUCE(Value *Src, bool IsSigned = false) +{ + return IRB()->CreateIntMinReduce(Src, IsSigned); +} + +CallInst* FP_MAX_REDUCE(Value *Src, bool NoNaN = false) +{ + return IRB()->CreateFPMaxReduce(Src, NoNaN); +} + +CallInst* FP_MIN_REDUCE(Value *Src, bool NoNaN = false) +{ + return IRB()->CreateFPMinReduce(Src, NoNaN); +} + +CallInst* LIFETIME_START(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateLifetimeStart(Ptr, Size); +} + +CallInst* LIFETIME_END(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateLifetimeEnd(Ptr, Size); +} + +CallInst* INVARIANT_START(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateInvariantStart(Ptr, Size); +} + +CallInst* MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask) +{ + return IRB()->CreateMaskedStore(Val, Ptr, Align, Mask); +} + +CallInst* MASKED_GATHER(Value *Ptrs, unsigned Align, Value *Mask = nullptr, Value *PassThru = nullptr, const Twine& Name = "") +{ + return IRB()->CreateMaskedGather(Ptrs, Align, Mask, PassThru, Name); +} + +CallInst* MASKED_SCATTER(Value *Val, Value *Ptrs, unsigned Align, Value *Mask = nullptr) +{ + return IRB()->CreateMaskedScatter(Val, Ptrs, Align, Mask); +} + +CallInst* ASSUMPTION(Value *Cond) +{ + return IRB()->CreateAssumption(Cond); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags, ArrayRef CallArgs, ArrayRef TransitionArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, InvokeArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, ArrayRef InvokeArgs, ArrayRef TransitionArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags, InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, 
ArrayRef InvokeArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, InvokeArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name = "") +{ + return IRB()->CreateGCResult(Statepoint, ResultType, Name); +} + +CallInst* GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name = "") +{ + return IRB()->CreateGCRelocate(Statepoint, BaseOffset, DerivedOffset, ResultType, Name); +} + +CallInst* BINARY_INTRINSIC(Intrinsic::ID ID, Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateBinaryIntrinsic(ID, LHS, RHS, nullptr, Name); +} + +CallInst* MIN_NUM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateMinNum(LHS, RHS, Name); +} + +CallInst* MAX_NUM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateMaxNum(LHS, RHS, Name); +} + +ReturnInst* RET_VOID() +{ + return IRB()->CreateRetVoid(); +} + +ReturnInst* RET(Value *V) +{ + return IRB()->CreateRet(V); +} + +ReturnInst* AGGREGATE_RET(Value *const *retVals, unsigned N) +{ + return IRB()->CreateAggregateRet(retVals, N); +} + +BranchInst* BR(BasicBlock *Dest) +{ + return IRB()->CreateBr(Dest); +} + +BranchInst* COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = nullptr, MDNode *Unpredictable = nullptr) +{ + return IRB()->CreateCondBr(Cond, True, False, BranchWeights, Unpredictable); +} + +BranchInst* COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, Instruction *MDSrc) +{ + return IRB()->CreateCondBr(Cond, True, False, MDSrc); +} + +SwitchInst* SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = nullptr, MDNode *Unpredictable = nullptr) +{ + return IRB()->CreateSwitch(V, Dest, NumCases, BranchWeights, Unpredictable); +} + +IndirectBrInst* INDIRECT_BR(Value *Addr, unsigned NumDests = 10) +{ + return IRB()->CreateIndirectBr(Addr, NumDests); +} + +InvokeInst* INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args = None, const Twine &Name = "") +{ + return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, Name); +} + +InvokeInst* INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, ArrayRef OpBundles, const Twine &Name = "") +{ + return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, OpBundles, Name); +} + +ResumeInst* RESUME(Value *Exn) +{ + return IRB()->CreateResume(Exn); +} + +CleanupReturnInst* CLEANUP_RET(CleanupPadInst *CleanupPad, BasicBlock *UnwindBB = nullptr) +{ + return IRB()->CreateCleanupRet(CleanupPad, UnwindBB); +} + +CatchSwitchInst* CATCH_SWITCH(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers, const Twine &Name = "") +{ + return IRB()->CreateCatchSwitch(ParentPad, UnwindBB, NumHandlers, Name); +} + +CatchPadInst* CATCH_PAD(Value *ParentPad, ArrayRef Args, const Twine &Name = "") +{ + return IRB()->CreateCatchPad(ParentPad, Args, Name); +} + +CleanupPadInst* CLEANUP_PAD(Value *ParentPad, ArrayRef Args = None, const Twine &Name = "") +{ + return IRB()->CreateCleanupPad(ParentPad, Args, Name); +} + +CatchReturnInst* CATCH_RET(CatchPadInst *CatchPad, BasicBlock *BB) +{ + return IRB()->CreateCatchRet(CatchPad, BB); +} + +UnreachableInst* UNREACHABLE() +{ + return IRB()->CreateUnreachable(); +} + +Value* ADD(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ 
+ return IRB()->CreateAdd(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_ADD(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWAdd(LHS, RHS, Name); +} + +Value* NUW_ADD(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWAdd(LHS, RHS, Name); +} + +Value* FADD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFAdd(LHS, RHS, Name, FPMathTag); +} + +Value* SUB(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateSub(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_SUB(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWSub(LHS, RHS, Name); +} + +Value* NUW_SUB(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWSub(LHS, RHS, Name); +} + +Value* FSUB(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFSub(LHS, RHS, Name, FPMathTag); +} + +Value* MUL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateMul(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_MUL(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWMul(LHS, RHS, Name); +} + +Value* NUW_MUL(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWMul(LHS, RHS, Name); +} + +Value* FMUL(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFMul(LHS, RHS, Name, FPMathTag); +} + +Value* UDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateUDiv(LHS, RHS, Name, isExact); +} + +Value* EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateExactUDiv(LHS, RHS, Name); +} + +Value* SDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateSDiv(LHS, RHS, Name, isExact); +} + +Value* EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateExactSDiv(LHS, RHS, Name); +} + +Value* FDIV(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFDiv(LHS, RHS, Name, FPMathTag); +} + +Value* UREM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateURem(LHS, RHS, Name); +} + +Value* SREM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateSRem(LHS, RHS, Name); +} + +Value* FREM(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFRem(LHS, RHS, Name, FPMathTag); +} + +Value* SHL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* SHL(Value *LHS, const APInt &RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* SHL(Value *LHS, uint64_t RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* LSHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* LSHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* LSHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false) +{ + return 
IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* AND(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* AND(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* AND(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* OR(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* OR(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* OR(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateBinOp(Opc, LHS, RHS, Name, FPMathTag); +} + +Value* NEG(Value *V, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateNeg(V, Name, HasNUW, HasNSW); +} + +Value* NSW_NEG(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNSWNeg(V, Name); +} + +Value* NUW_NEG(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNUWNeg(V, Name); +} + +Value* FNEG(Value *V, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFNeg(V, Name, FPMathTag); +} + +Value* NOT(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNot(V, Name); +} + +AllocaInst* ALLOCA(Type *Ty, unsigned AddrSpace, Value *ArraySize = nullptr, const Twine &Name = "") +{ + return IRB()->CreateAlloca(Ty, ArraySize, Name, AddrSpace /* IGCLLVM wrapper interface */); +} + +AllocaInst* ALLOCA(Type *Ty, Value *ArraySize = nullptr, const Twine &Name = "") +{ + return IRB()->CreateAlloca(Ty, ArraySize, Name); +} + +StoreInst* STORE(Value *Val, Value *Ptr, bool isVolatile = false) +{ + return IRB()->CreateStore(Val, Ptr, isVolatile); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name) +{ + return IRB()->CreateAlignedLoad(Ptr, Align, Name); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name = "") +{ + return IRB()->CreateAlignedLoad(Ptr, Align, Name); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = "") +{ + return IRB()->CreateAlignedLoad(Ptr, Align, isVolatile, Name); +} + +StoreInst* ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false) +{ + return IRB()->CreateAlignedStore(Val, Ptr, Align, isVolatile); +} + +FenceInst* FENCE(AtomicOrdering Ordering, SyncScope::ID SSID = SyncScope::System, const Twine &Name = "") +{ + return IRB()->CreateFence(Ordering, SSID, Name); +} + +AtomicCmpXchgInst* ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, 
Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID = SyncScope::System) +{ + return IRB()->CreateAtomicCmpXchg(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID); +} + +AtomicRMWInst* ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SyncScope::ID SSID = SyncScope::System) +{ + return IRB()->CreateAtomicRMW(Op, Ptr, Val, Ordering, SSID); +} + +Value* IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ptr, IdxList, Name); +} + +Value* IN_BOUNDS_GEP(Type *Ty, Value *Ptr, ArrayRef IdxList, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ty, Ptr, IdxList, Name); +} + +Value* IN_BOUNDS_GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ty, Ptr, Idx, Name); +} + +Value* CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_32(Ptr, Idx0, Name); +} + +Value* CONST_GEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_32(Ty, Ptr, Idx0, Name); +} + +Value* CONST_IN_BOUNDS_GEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP1_32(Ty, Ptr, Idx0, Name); +} + +Value* CONST_GEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstGEP2_32(Ty, Ptr, Idx0, Idx1, Name); +} + +Value* CONST_IN_BOUNDS_GEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP2_32(Ty, Ptr, Idx0, Idx1, Name); +} + +Value* CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_64(Ptr, Idx0, Name); +} + +Value* CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP1_64(Ptr, Idx0, Name); +} + +Value* CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstGEP2_64(Ptr, Idx0, Idx1, Name); +} + +Value* CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP2_64(Ptr, Idx0, Idx1, Name); +} + +Value* STRUCT_GEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name = "") +{ + return IRB()->CreateStructGEP(Ty, Ptr, Idx, Name); +} + +Value* GLOBAL_STRING_PTR(StringRef Str, const Twine &Name = "", unsigned AddressSpace = 0) +{ + return IRB()->CreateGlobalStringPtr(Str, Name, AddressSpace); +} + +Value* TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateTrunc(V, DestTy, Name); +} + +Value* Z_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExt(V, DestTy, Name); +} + +Value* S_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExt(V, DestTy, Name); +} + +Value* Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExtOrTrunc(V, DestTy, Name); +} + +Value* S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExtOrTrunc(V, DestTy, Name); +} + +Value* FP_TO_UI(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPToUI(V, DestTy, Name); +} + +Value* FP_TO_SI(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPToSI(V, DestTy, Name); +} + +Value* UI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateUIToFP(V, DestTy, Name); +} + 
+Value* SI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSIToFP(V, DestTy, Name); +} + +Value* FP_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPTrunc(V, DestTy, Name); +} + +Value* FP_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPExt(V, DestTy, Name); +} + +Value* PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePtrToInt(V, DestTy, Name); +} + +Value* INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateIntToPtr(V, DestTy, Name); +} + +Value* BITCAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateBitCast(V, DestTy, Name); +} + +Value* ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateAddrSpaceCast(V, DestTy, Name); +} + +Value* Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExtOrBitCast(V, DestTy, Name); +} + +Value* S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExtOrBitCast(V, DestTy, Name); +} + +Value* TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateTruncOrBitCast(V, DestTy, Name); +} + +Value* CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateCast(Op, V, DestTy, Name); +} + +Value* POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePointerCast(V, DestTy, Name); +} + +Value* POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePointerBitCastOrAddrSpaceCast(V, DestTy, Name); +} + +Value* INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name = "") +{ + return IRB()->CreateIntCast(V, DestTy, isSigned, Name); +} + +Value* BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateBitOrPointerCast(V, DestTy, Name); +} + +Value* FP_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPCast(V, DestTy, Name); +} + +Value* ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpEQ(LHS, RHS, Name); +} + +Value* ICMP_NE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpNE(LHS, RHS, Name); +} + +Value* ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpUGT(LHS, RHS, Name); +} + +Value* ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpUGE(LHS, RHS, Name); +} + +Value* ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpULT(LHS, RHS, Name); +} + +Value* ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpULE(LHS, RHS, Name); +} + +Value* ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSGT(LHS, RHS, Name); +} + +Value* ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSGE(LHS, RHS, Name); +} + +Value* ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSLT(LHS, RHS, Name); +} + +Value* ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSLE(LHS, RHS, Name); +} + +Value* FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOEQ(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode 
*FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOGT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOGE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOLT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOLE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpONE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpORD(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUNO(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUEQ(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUGT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUGE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpULT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpULE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUNE(LHS, RHS, Name, FPMathTag); +} + +Value* ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmp(P, LHS, RHS, Name); +} + +Value* FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmp(P, LHS, RHS, Name, FPMathTag); +} + +PHINode* PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = "") +{ + return IRB()->CreatePHI(Ty, NumReservedValues, Name); +} + +CallInst* CALLA(Value *Callee, ArrayRef Args = None, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, Name, FPMathTag); +} + +CallInst* CALLA(FunctionType *FTy, Value *Callee, ArrayRef Args, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(FTy, Callee, Args, Name, FPMathTag); +} + +CallInst* CALLA(Value *Callee, ArrayRef Args, ArrayRef OpBundles, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, OpBundles, Name, FPMathTag); +} + +CallInst* CALLA(Function *Callee, ArrayRef Args, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, Name, FPMathTag); +} + +Value* SELECT(Value *C, Value *True, Value *False, const Twine &Name = "", Instruction *MDFrom = nullptr) +{ + return IRB()->CreateSelect(C, True, False, Name, MDFrom); +} + +VAArgInst* VA_ARG(Value *List, Type *Ty, const Twine &Name = "") +{ + return IRB()->CreateVAArg(List, Ty, Name); +} + +Value* VEXTRACT(Value *Vec, Value *Idx, const Twine &Name = 
"") +{ + return IRB()->CreateExtractElement(Vec, Idx, Name); +} + +Value* VEXTRACT(Value *Vec, uint64_t Idx, const Twine &Name = "") +{ + return IRB()->CreateExtractElement(Vec, Idx, Name); +} + +Value* VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = "") +{ + return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); +} + +Value* VINSERT(Value *Vec, Value *NewElt, uint64_t Idx, const Twine &Name = "") +{ + return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); +} + +Value* VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name = "") +{ + return IRB()->CreateShuffleVector(V1, V2, Mask, Name); +} + +Value* VSHUFFLE(Value *V1, Value *V2, ArrayRef IntMask, const Twine &Name = "") +{ + return IRB()->CreateShuffleVector(V1, V2, IntMask, Name); +} + +Value* EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name = "") +{ + return IRB()->CreateExtractValue(Agg, Idxs, Name); +} + +Value* INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name = "") +{ + return IRB()->CreateInsertValue(Agg, Val, Idxs, Name); +} + +LandingPadInst* LANDING_PAD(Type *Ty, unsigned NumClauses, const Twine &Name = "") +{ + return IRB()->CreateLandingPad(Ty, NumClauses, Name); +} + +Value* IS_NULL(Value *Arg, const Twine &Name = "") +{ + return IRB()->CreateIsNull(Arg, Name); +} + +Value* IS_NOT_NULL(Value *Arg, const Twine &Name = "") +{ + return IRB()->CreateIsNotNull(Arg, Name); +} + +Value* PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreatePtrDiff(LHS, RHS, Name); +} + +Value* INVARIANT_GROUP_BARRIER(Value *Ptr) +{ + return IRB()->CreateLaunderInvariantGroup(Ptr); +} + +Value* VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name = "") +{ + return IRB()->CreateVectorSplat(NumElts, V, Name); +} + +Value* EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name) +{ + return IRB()->CreateExtractInteger(DL, From, ExtractedTy, Offset, Name); +} + +CallInst* ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue = nullptr) +{ + return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); +} + +CallInst* ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, Value *Alignment, Value *OffsetValue = nullptr) +{ + return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp new file mode 100644 index 000000000000..d19a1bb3dfcc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp @@ -0,0 +1,172 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder_intrin.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . +// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated llvm intrinsics +//============================================================================ +Value* CTTZ(Value* a, Value* flag, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::cttz, args); + return CALL(pFunc, std::initializer_list{a, flag}, name); +} + +Value* CTLZ(Value* a, Value* flag, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctlz, args); + return CALL(pFunc, std::initializer_list{a, flag}, name); +} + +Value* VSQRTPS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::sqrt, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* STACKSAVE(const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::stacksave); + return CALL(pFunc, std::initializer_list{}, name); +} + +Value* STACKRESTORE(Value* a, const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::stackrestore); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VMINPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::minnum, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VMAXPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::maxnum, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* DEBUGTRAP(const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::debugtrap); + return CALL(pFunc, std::initializer_list{}, name); +} + +Value* POPCNT(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctpop, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* LOG2(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::log2, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + 
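+// Editorial sketch, hand-written rather than emitted by gen_llvm_ir_macros.py:
+// each wrapper above resolves the type-overloaded LLVM intrinsic with
+// Intrinsic::getDeclaration and funnels the call through CALL(). Spelled out for
+// ctlz, building the i1 "zero-is-undef" flag explicitly instead of taking it as a
+// Value* (the helper name CTLZ_FLAG is hypothetical):
+Value* CTLZ_FLAG(Value* a, bool zeroIsUndef, const llvm::Twine& name = "")
+{
+    // ctlz is overloaded on its operand type, so the declaration is looked up
+    // with that type.
+    SmallVector<Type*, 1> args;
+    args.push_back(a->getType());
+    Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctlz, args);
+    // Second operand: i1 flag selecting "result is undefined for a zero input".
+    Value* flag = ConstantInt::get(Type::getInt1Ty(a->getContext()), zeroIsUndef ? 1 : 0);
+    return CALL(pFunc, std::initializer_list<Value*>{a, flag}, name);
+}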
+Value* FABS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::fabs, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* EXP2(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::exp2, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* COS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::cos, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* SIN(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::sin, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* FLOOR(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::floor, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* POW(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::pow, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp new file mode 100644 index 000000000000..34d692069f7a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp @@ -0,0 +1,244 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder_meta.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . 
+// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated meta intrinsics +//============================================================================ +Value* VGATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERPD", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERDD", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VRCPPS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VRCPPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VROUND(Value* a, Value* rounding, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(rounding->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VROUND", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, rounding}, name); +} + +Value* BEXTR_32(Value* src, Value* control, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(control->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.BEXTR_32", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, control}, name); +} + +Value* VPSHUFB(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = 
FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPSHUFB", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPERMD(Value* a, Value* idx, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(idx->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPERMD", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, idx}, name); +} + +Value* VPERMPS(Value* idx, Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(idx->getType()); + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPERMPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{idx, a}, name); +} + +Value* VCVTPD2PS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(VectorType::get(mFP32Ty, a->getType()->getVectorNumElements()), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPD2PS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VCVTPH2PS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(VectorType::get(mFP32Ty, a->getType()->getVectorNumElements()), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPH2PS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VCVTPS2PH(Value* a, Value* round, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(round->getType()); + FunctionType* pFuncTy = FunctionType::get(mSimdInt16Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPS2PH", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, round}, name); +} + +Value* VHSUBPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VHSUBPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPTESTC(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(mInt32Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPTESTC", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPTESTZ(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(mInt32Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPTESTZ", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VFMADDPS(Value* a, Value* b, Value* c, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + 
argTypes.push_back(b->getType()); + argTypes.push_back(c->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VFMADDPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b, c}, name); +} + +Value* VPHADDD(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPHADDD", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* PDEP32(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.PDEP32", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* RDTSC(const llvm::Twine& name = "") +{ + FunctionType* pFuncTy = FunctionType::get(mInt64Ty, {}, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.RDTSC", pFuncTy)); + return CALL(pFunc, std::initializer_list{}, name); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp new file mode 100644 index 000000000000..02d3d7f9e86f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp @@ -0,0 +1,1942 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +//===----------------------------------------------------------------------===// +// +/// CMABI +/// ----- +/// +/// This pass fixes ABI issues for the genx backend. Currently, it +/// +/// - transforms pass by pointer argument into copy-in and copy-out; +/// +/// - localizes global scalar or vector variables into copy-in and copy-out; +/// +/// - passes bool arguments as i8 (matches cm-icl's hehavior). 
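+///
+/// For instance (illustrative only), the last bullet changes a kernel signature
+/// roughly as follows; a trunc named "tobool" is inserted at the kernel entry so
+/// existing users still see an i1 value:
+///
+/// .. code-block:: text
+///
+///   ; before
+///   define dllexport void @k(i1 %flag)
+///
+///   ; after
+///   define dllexport void @k(i8 %flag) {
+///     %tobool = trunc i8 %flag to i1
+///     ...
+///   }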
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmabi" + +#include "llvmWrapper/Support/Alignment.h" + +#include "llvm/ADT/DenseMap.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/GenXSTLExtras.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +#include "llvmWrapper/Analysis/CallGraph.h" + +#include + +using namespace llvm; + +using LocalizationLimitT = int32_t; +static constexpr auto LocalizeAll = std::numeric_limits::max(); +static cl::opt + LocalizationLimit("cm-abi-issues-localization-limit", + cl::desc("maximum size (in bytes) used to localize global variables"), + cl::init(LocalizeAll)); + +STATISTIC(NumArgumentsTransformed, "Number of pointer arguments transformed"); +STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated"); + +namespace llvm { +void initializeCMABIPass(PassRegistry &); +void initializeCMLowerVLoadVStorePass(PassRegistry &); +} + +/// Localizing global variables +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// General idea of localizing global variables into locals. Globals used in +/// different kernels get a seperate copy and they are always invisiable to +/// other kernels and we can safely localize all globals used (including +/// indirectly) in a kernel. For example, +/// +/// .. code-block:: text +/// +/// @gv1 = global <8 x float> zeroinitializer, align 32 +/// @gv2 = global <8 x float> zeroinitializer, align 32 +/// @gv3 = global <8 x float> zeroinitializer, align 32 +/// +/// define dllexport void @f0() { +/// call @f1() +/// call @f2() +/// call @f3() +/// } +/// +/// define internal void @f1() { +/// ; ... +/// store <8 x float> %splat1, <8 x float>* @gv1, align 32 +/// } +/// +/// define internal void @f2() { +/// ; ... +/// store <8 x float> %splat2, <8 x float>* @gv2, align 32 +/// } +/// +/// define internal void @f3() { +/// %1 = <8 x float>* @gv1, align 32 +/// %2 = <8 x float>* @gv2, align 32 +/// %3 = fadd <8 x float> %1, <8 x float> %2 +/// store <8 x float> %3, <8 x float>* @gv3, align 32 +/// } +/// +/// will be transformed into +/// +/// .. 
code-block:: text +/// +/// define dllexport void @f0() { +/// %v1 = alloca <8 x float>, align 32 +/// %v2 = alloca <8 x float>, align 32 +/// %v3 = alloca <8 x float>, align 32 +/// +/// %0 = load <8 x float> * %v1, align 32 +/// %1 = { <8 x float> } call @f1_transformed(<8 x float> %0) +/// %2 = extractvalue { <8 x float> } %1, 0 +/// store <8 x float> %2, <8 x float>* %v1, align 32 +/// +/// %3 = load <8 x float> * %v2, align 32 +/// %4 = { <8 x float> } call @f2_transformed(<8 x float> %3) +/// %5 = extractvalue { <8 x float> } %4, 0 +/// store <8 x float> %5, <8 x float>* %v1, align 32 +/// +/// %6 = load <8 x float> * %v1, align 32 +/// %7 = load <8 x float> * %v2, align 32 +/// %8 = load <8 x float> * %v3, align 32 +/// +/// %9 = { <8 x float>, <8 x float>, <8 x float> } +/// call @f3_transformed(<8 x float> %6, <8 x float> %7, <8 x float> %8) +/// +/// %10 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 0 +/// store <8 x float> %10, <8 x float>* %v1, align 32 +/// %11 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 1 +/// store <8 x float> %11, <8 x float>* %v2, align 32 +/// %12 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 2 +/// store <8 x float> %12, <8 x float>* %v3, align 32 +/// } +/// +/// All callees will be updated accordingly, E.g. f1_transformed becomes +/// +/// .. code-block:: text +/// +/// define internal { <8 x float> } @f1_transformed(<8 x float> %v1) { +/// %0 = alloca <8 x float>, align 32 +/// store <8 x float> %v1, <8 x float>* %0, align 32 +/// ; ... +/// store <8 x float> %splat1, <8 x float>* @0, align 32 +/// ; ... +/// %1 = load <8 x float>* %0, align 32 +/// %2 = insertvalue { <8 x float> } undef, <8 x float> %1, 0 +/// ret { <8 x float> } %2 +/// } +/// +namespace { + +// \brief Collect necessary information for global variable localization. +class LocalizationInfo { +public: + typedef SetVector GlobalSetTy; + + explicit LocalizationInfo(Function *F) : Fn(F) {} + LocalizationInfo() : Fn(0) {} + + Function *getFunction() const { return Fn; } + bool empty() const { return Globals.empty(); } + GlobalSetTy &getGlobals() { return Globals; } + + // \brief Add a global. + void addGlobal(GlobalVariable *GV) { + Globals.insert(GV); + } + + // \brief Add all globals from callee. + void addGlobals(LocalizationInfo &LI) { + Globals.insert(LI.getGlobals().begin(), LI.getGlobals().end()); + } + + void setArgIndex(GlobalVariable *GV, unsigned ArgIndex) { + assert(!IndexMap.count(GV)); + IndexMap[GV] = ArgIndex; + } + unsigned getArgIndex(GlobalVariable *GV) const { + assert(IndexMap.count(GV)); + return IndexMap.lookup(GV); + } + +private: + // \brief The function being analyzed. + Function *Fn; + + // \brief Global variables that are used directly or indirectly. + GlobalSetTy Globals; + + // This map keeps track of argument index for a global variable. + SmallDenseMap IndexMap; +}; + +// Diagnostic information for error/warning for overlapping arg +class DiagnosticInfoOverlappingArgs : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. 
+ DiagnosticInfoOverlappingArgs(Instruction *Inst, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoOverlappingArgs::KindID = 0; + + + +struct CMABI : public CallGraphSCCPass { + static char ID; + + CMABI() : CallGraphSCCPass(ID) { + initializeCMABIPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + CallGraphSCCPass::getAnalysisUsage(AU); + } + + virtual bool runOnSCC(CallGraphSCC &SCC); + + virtual bool doInitialization(CallGraph &CG); + virtual bool doFinalization(CallGraph &CG); + +private: + unsigned int const MaxCallSites = 5; + + CallGraphNode *ProcessNode(CallGraphNode *CGN); + + // Fix argument passing for kernels. + CallGraphNode *TransformKernel(Function *F); + + // Major work is done in this method. + CallGraphNode *TransformNode(Function *F, + SmallPtrSet &ArgsToTransform, + LocalizationInfo &LI); + + // \brief Create allocas for globals and replace their uses. + void LocalizeGlobals(LocalizationInfo &LI); + + // \brief Compute the localized global variables for each function. + void AnalyzeGlobals(CallGraph &CG); + + // \brief Returns the localization info associated to a function. + LocalizationInfo &getLocalizationInfo(Function *F) { + if (!GlobalInfo.count(F)) { + LocalizationInfo *LI = new LocalizationInfo(F); + LocalizationInfoObjs.push_back(LI); + GlobalInfo[F] = LI; + return *LI; + } + return *GlobalInfo[F]; + } + + void addDirectGlobal(Function *F, GlobalVariable *GV) { + getLocalizationInfo(F).addGlobal(GV); + } + + // \brief Add all globals from callee to caller. + void addIndirectGlobal(Function *F, Function *Callee) { + getLocalizationInfo(F).addGlobals(getLocalizationInfo(Callee)); + } + + // Return true if pointer type argument arg appears in a + // store instruction. This helps decide whether it is safe + // to convert ptr arg to byvalue arg. Latter can be passed + // in GRF. + bool IsPtrArgModified(Value * Arg); + + // \brief Diagnose illegal overlapping by-ref args. + void diagnoseOverlappingArgs(CallInst *CI); + + // This map captures all global variables to be localized. + SmallDenseMap GlobalInfo; + + // Kernels in the module being processed. + SmallPtrSet Kernels; + + // Already visited functions. + SmallPtrSet AlreadyVisited; + + // LocalizationInfo objects created. + SmallVector LocalizationInfoObjs; +}; + +} // namespace + +// Currently weight of the global defines by its size +static int calcGVWeight(const GlobalVariable &GV, const DataLayout &DL) { + return DL.getTypeAllocSize(GV.getValueType()); +} + +/* selectGlobalsToLocalize - chooses which globals to localize + * Returns vector of pointers to such globals. 
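+ *
+ * A quick worked illustration of the weight-bounded selection described below
+ * (numbers invented for the example): with globals of 4, 8, 64 and 128 bytes and
+ * a 100-byte bound, the three lightest globals are localized (4 + 8 + 64 = 76
+ * bytes) and the 128-byte one stays a real global, since adding it would push
+ * the running sum past the bound.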
+ * + * Algorithm: exclude globals that definitely should not be localized + * sort globals by weight, choose first smallest ones, sum of which is under \p + * Bound + * + * \p Globals - range of globals to choose from + * \p Bound - bound not to overcome + * \p ExcludePred - functor : GVRef -> bool, true if global should not be + * localized \p WeightCalculator - functor : GVRef -> decltype(Bound), returns + * weight of global + */ +template +auto selectGlobalsToLocalize(ForwardRange Globals, T Bound, + ExcludePredT ExcludePred, + WeightCalculatorT WeightCalculator) + -> std::vector> { + assert(Bound >= 0 && "bound must be nonnegative"); + using GVPtr = genx::ranges::range_pointer_t; + using GVRef = genx::ranges::range_reference_t; + if (Bound == 0) + return std::vector(); + + // filter out those, that we must exclude + auto Unexcluded = make_filter_range( + Globals, [ExcludePred](GVRef GV) { return !ExcludePred(GV); }); + using GVWithWeightT = std::pair; + + if (Bound == LocalizeAll) { + std::vector ToLocalize; + transform(Unexcluded, std::back_inserter(ToLocalize), + [](GVRef GV) { return &GV; }); + return ToLocalize; + } + + std::vector ToLocalizeWithWeight; + transform(Unexcluded, std::back_inserter(ToLocalizeWithWeight), + [WeightCalculator](GVRef GV) { + return std::make_pair(&GV, WeightCalculator(GV)); + }); + + // sort globals by weight + std::sort(ToLocalizeWithWeight.begin(), ToLocalizeWithWeight.end(), + [](GVWithWeightT LHS, GVWithWeightT RHS) { + return LHS.second < RHS.second; + }); + + // filter max number of lightest ones, which weight sum is under the bound + auto FirstNotToLocalize = genx::upper_partial_sum_bound( + ToLocalizeWithWeight.begin(), ToLocalizeWithWeight.end(), Bound, + [](decltype(Bound) Base, GVWithWeightT Inc) { + return Base + Inc.second; + }); + + // collect them back to ToLocalize + std::vector ToLocalize; + ToLocalize.reserve(FirstNotToLocalize - ToLocalizeWithWeight.begin()); + std::transform(ToLocalizeWithWeight.begin(), FirstNotToLocalize, + std::back_inserter(ToLocalize), + [](GVWithWeightT GV) { return GV.first; }); + + return ToLocalize; +} + +bool CMABI::doInitialization(CallGraph &CG) { + // Analyze global variable usages and for each function attaches global + // variables to be copy-in and copy-out. + AnalyzeGlobals(CG); + + auto getValue = [](Metadata *M) -> Value * { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + }; + + // Collect all CM kernels from named metadata. + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + assert(Named); + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (Function *F = + dyn_cast_or_null(getValue(Node->getOperand(0)))) + Kernels.insert(F); + } + } + + // no change. + return false; +} + +bool CMABI::doFinalization(CallGraph &CG) { + bool Changed = false; + for (Module::global_iterator I = CG.getModule().global_begin(); + I != CG.getModule().global_end(); + /*empty*/) { + GlobalVariable *GV = &*I++; + if (GV->use_empty()) { + GV->eraseFromParent(); + Changed = true; + } + } + + for (LocalizationInfo *Obj : LocalizationInfoObjs) + delete Obj; + + return Changed; +} + +bool CMABI::runOnSCC(CallGraphSCC &SCC) { + bool Changed = false, LocalChange; + + // Diagnose overlapping by-ref args. 
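+  // (Illustrative, hypothetical CM source: a call such as
+  //   foo(v.select<8,1>(0), v.select<8,1>(4))
+  // where both arguments are taken by reference and one of them is written inside
+  // foo violates the copy-in/copy-out restriction documented before TransformNode
+  // below; such call sites are what gets reported here.)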
+  for (auto i = SCC.begin(), e = SCC.end(); i != e; ++i) {
+    Function *F = (*i)->getFunction();
+    if (!F || F->empty())
+      continue;
+    for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) {
+      auto CI = dyn_cast<CallInst>(ui->getUser());
+      if (CI && CI->getNumArgOperands() == ui->getOperandNo())
+        diagnoseOverlappingArgs(CI);
+    }
+  }
+
+  // Iterate until we stop transforming from this SCC.
+  do {
+    LocalChange = false;
+    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+      if (CallGraphNode *CGN = ProcessNode(*I)) {
+        LocalChange = true;
+        SCC.ReplaceNode(*I, CGN);
+      }
+    }
+    Changed |= LocalChange;
+  } while (LocalChange);
+
+  return Changed;
+}
+
+// Sometimes a phi has a GEP (or some other constant expression) as an incoming
+// value. While the GEP's operands are constants, that is fine: the GEP itself is
+// a constant too. But once we replace those constants with locals, the GEP
+// becomes a regular instruction that would sit right before the phi, which is
+// invalid IR. This helper moves such an instruction to the end of the
+// corresponding incoming block, just before its terminator.
+static void fixPhiUseIssue(Instruction *Inst) {
+  auto PhiUse = cast<PHINode>(Inst->use_begin()->getUser());
+  auto InstOpNoInPhi = Inst->use_begin()->getOperandNo();
+  assert(Inst->getParent() == PhiUse->getParent());
+  Inst->removeFromParent();
+  Inst->insertBefore(PhiUse->getIncomingBlock(InstOpNoInPhi)->getTerminator());
+}
+
+// Replace uses of global variables with the corresponding allocas within a
+// specified function.
+//
+// Returns a vector of instructions with a phi use that have to be fixed up
+// afterwards (see fixPhiUseIssue above).
+static std::vector<Instruction *>
+replaceUsesWithinFunction(SmallDenseMap<Value *, Value *> &GlobalsToReplace,
+                          Function *F) {
+  std::vector<Instruction *> PhiUseIssueInsts;
+  for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    Instruction *Inst = &*I;
+    for (unsigned i = 0, e = Inst->getNumOperands(); i < e; ++i) {
+      auto Iter = GlobalsToReplace.find(Inst->getOperand(i));
+      if (Iter != GlobalsToReplace.end())
+        Inst->setOperand(i, Iter->second);
+    }
+    if (Inst->getNumUses() == 1) {
+      auto PhiUse = dyn_cast<PHINode>(Inst->use_begin()->getUser());
+      if (PhiUse && Inst->getParent() == PhiUse->getParent()) {
+        PhiUseIssueInsts.push_back(Inst);
+      }
+    }
+  }
+  return PhiUseIssueInsts;
+}
+
+// \brief Create allocas for globals directly used in this kernel and
+// replace all uses.
+void CMABI::LocalizeGlobals(LocalizationInfo &LI) {
+  const LocalizationInfo::GlobalSetTy &Globals = LI.getGlobals();
+  typedef LocalizationInfo::GlobalSetTy::const_iterator IteratorTy;
+
+  SmallDenseMap<Value *, Value *> GlobalsToReplace;
+  Function *Fn = LI.getFunction();
+  for (IteratorTy I = Globals.begin(), E = Globals.end(); I != E; ++I) {
+    GlobalVariable *GV = (*I);
+    LLVM_DEBUG(dbgs() << "Localizing global: " << *GV);
+
+    Instruction &FirstI = *Fn->getEntryBlock().begin();
+    Type *ElemTy = GV->getType()->getElementType();
+    AllocaInst *Alloca = new AllocaInst(ElemTy, 0 /*AddressSpace*/,
+                                        GV->getName() + ".local", &FirstI);
+    Alloca->setAlignment(MaybeAlign(GV->getAlignment()));
+    if (!isa<UndefValue>(GV->getInitializer()))
+      new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+    GlobalsToReplace.insert(std::make_pair(GV, Alloca));
+  }
+
+  // Replace all uses of the globals within this function.
+  auto PhiUseIssueInsts = replaceUsesWithinFunction(GlobalsToReplace, Fn);
+
+  for (auto InstWithPhiUse : PhiUseIssueInsts) {
+    fixPhiUseIssue(InstWithPhiUse);
+  }
+}
+
+CallGraphNode *CMABI::ProcessNode(CallGraphNode *CGN) {
+  Function *F = CGN->getFunction();
+
+  // Nothing to do for declarations or already visited functions.
+ if (!F || F->isDeclaration() || AlreadyVisited.count(F)) + return 0; + + // Variables to be localized. + LocalizationInfo &LI = getLocalizationInfo(F); + + // This is a kernel. + if (Kernels.count(F)) { + // Localize globals for kernels. + if (!LI.getGlobals().empty()) + LocalizeGlobals(LI); + + // Check whether there are i1 or vxi1 kernel arguments. + for (auto AI = F->arg_begin(), AE = F->arg_end(); AI != AE; ++AI) + if (AI->getType()->getScalarType()->isIntegerTy(1)) + return TransformKernel(F); + + // No changes to this kernel's prototype. + return 0; + } + + // Non-kernels, only transforms module locals. + if (!F->hasLocalLinkage()) + return 0; + + SmallVector PointerArgs; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) + if (I->getType()->isPointerTy()) + PointerArgs.push_back(I); + + // Check if there is any pointer arguments or globals to localize. + if (PointerArgs.empty() && LI.empty()) + return 0; + + // Check transformable arguments. + SmallPtrSet ArgsToTransform; + for (unsigned i = 0, e = PointerArgs.size(); i != e; ++i) { + Argument *PtrArg = PointerArgs[i]; + Type *ArgTy = cast(PtrArg->getType())->getElementType(); + + // Only transform to simple types. + if ((F->getNumUses() > MaxCallSites || ArgTy->isVectorTy() || IsPtrArgModified(PtrArg)) && + (ArgTy->isIntOrIntVectorTy() || ArgTy->isFPOrFPVectorTy())) + ArgsToTransform.insert(PtrArg); + } + + if (ArgsToTransform.empty() && LI.empty()) + return 0; + + return TransformNode(F, ArgsToTransform, LI); +} + +// check for typical inst sequences passing arg as a base +// of store-like intrinsics +static bool checkSinkToMemIntrinsic(Instruction *Inst) { + auto *CI = dyn_cast(Inst); + if (CI && (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_svm_scatter || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_scatter_scaled)) + return true; + for (auto *U : Inst->users()) { + if (isa(U) || isa(U) || + isa(U) || isa(U)) + return checkSinkToMemIntrinsic(cast(U)); + } + return false; +} + +bool CMABI::IsPtrArgModified(Value *Arg) { + // Arg is a ptr to a vector type. If data is written using a + // store, then return true. This means copy-in/copy-out are + // needed as caller may use the updated value. If no data is + // ever stored in Arg then return false. It is safe to + // convert the parameter to pass-by-value in GRF. + // This is a recursive function. + for (const auto &U : Arg->users()) { + if (auto *I = dyn_cast(U)) { + if (isa(U)) + return true; + else if (isa(U) || isa(U)) + return IsPtrArgModified(U); + else if (isa(U)) + return checkSinkToMemIntrinsic(I); + } + } + return false; +} + +// \brief Fix argument passing for kernels: i1 -> i8. 
+CallGraphNode *CMABI::TransformKernel(Function *F) { + assert(F->getReturnType()->isVoidTy()); + LLVMContext &Context = F->getContext(); + + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + + // First, determine the new argument list + SmallVector ArgTys; + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + Type *ArgTy = I->getType(); + // Change i1 to i8 and vxi1 to vxi8 + if (ArgTy->getScalarType()->isIntegerTy(1)) { + Type *Ty = IntegerType::get(F->getContext(), 8); + if (ArgTy->isVectorTy()) + ArgTys.push_back(VectorType::get(Ty, ArgTy->getVectorNumElements())); + else + ArgTys.push_back(Ty); + } else { + // Unchanged argument + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, ArgTys.size(), B); + } + ArgTys.push_back(I->getType()); + } + } + + FunctionType *NFTy = FunctionType::get(F->getReturnType(), ArgTys, false); + assert((NFTy != F->getFunctionType()) && + "type out of sync, expect bool arguments"); + + // Add any function attributes. + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMABI: Transforming to:" << *NF << "\n" << "From: " << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setSubprogram(F->getSubprogram()); // tranfer debug-info + NF->setCallingConv(F->getCallingConv()); + + // Since we have now created the new function, splice the body of the old + // function right into the new function. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I, ++I2) { + // For an unmodified argument, move the name and users over. + if (!I->getType()->getScalarType()->isIntegerTy(1)) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + } else { + Instruction *InsertPt = &*(NF->begin()->begin()); + Instruction *Conv = new TruncInst(I2, I->getType(), "tobool", InsertPt); + I->replaceAllUsesWith(Conv); + I2->takeName(I); + } + } + + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + // Update the metadata entry. + if (F->hasDLLExportStorageClass()) + NF->setDLLStorageClass(F->getDLLStorageClass()); + + auto getValue = [](Metadata *M) -> Value * { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + }; + + // Scan the CM kernel metadata and replace with NF. + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (F == dyn_cast_or_null(getValue(Node->getOperand(0)))) + Node->replaceOperandWith(genx::KernelMDOp::FunctionRef, ValueAsMetadata::get(NF)); + } + } + + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around. 
+ NF_CGN->stealCalledFunctionsFrom(CG[F]); + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); + + return NF_CGN; +} + +// \brief Actually performs the transformation of the specified arguments, and +// returns the new function. +// +// Note this transformation does change the semantics as a C function, due to +// possible pointer aliasing. But it is allowed as a CM function. +// +// The pass-by-reference scheme is useful to copy-out values from the +// subprogram back to the caller. It also may be useful to convey large inputs +// to subprograms, as the amount of parameter conveying code will be reduced. +// There is a restriction imposed on arguments passed by reference in order to +// allow for an efficient CM implementation. Specifically the restriction is +// that for a subprogram that uses pass-by-reference, the behavior must be the +// same as if we use a copy-in/copy-out semantic to convey the +// pass-by-reference argument; otherwise the CM program is said to be erroneous +// and may produce incorrect results. Such errors are not caught by the +// compiler and it is up to the user to guarantee safety. +// +// The implication of the above stated restriction is that no pass-by-reference +// argument that is written to in a subprogram (either directly or transitively +// by means of a nested subprogram call pass-by-reference argument) may overlap +// with another pass-by-reference parameter or a global variable that is +// referenced in the subprogram; in addition no pass-by-reference subprogram +// argument that is referenced may overlap with a global variable that is +// written to in the subprogram. +// +CallGraphNode *CMABI::TransformNode(Function *F, + SmallPtrSet &ArgsToTransform, + LocalizationInfo &LI) { + // Computing a new prototype for the function. E.g. + // + // i32 @foo(i32, <8 x i32>*) becomes {i32, <8 x i32>} @bar(i32, <8 x i32>) + // + FunctionType *FTy = F->getFunctionType(); + SmallVector RetTys; + if (!FTy->getReturnType()->isVoidTy()) + RetTys.push_back(FTy->getReturnType()); + auto SkipHeuristic = (F->getNumUses() > MaxCallSites); + + // Keep track of parameter attributes for the arguments that we are *not* + // transforming. For the ones we do transform, parameter attributes are lost. + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + LLVMContext &Context = F->getContext(); + + // First, determine the new argument list + SmallVector Params; + SmallPtrSet CopyInOutNeeded; + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + if (!ArgsToTransform.count(I)) { + // Unchanged argument + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, Params.size(), B); + } + Params.push_back(I->getType()); + } else if (I->use_empty()) { + // Delete unused arguments + ++NumArgumentsDead; + } else { + // Use the element type as the new argument type. 
+ Params.push_back(I->getType()->getPointerElementType()); + + if (IsPtrArgModified(I) || SkipHeuristic) { + CopyInOutNeeded.insert(I); + RetTys.push_back(I->getType()->getPointerElementType()); + } + + ++NumArgumentsTransformed; + } + } + + typedef LocalizationInfo::GlobalSetTy::iterator IteratorTy; + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + // Store the index information of this global variable. + LI.setArgIndex(GV, Params.size()); + + Type *PointeeTy = GV->getType()->getPointerElementType(); + Params.push_back(PointeeTy); + RetTys.push_back(PointeeTy); + } + + // Add any function attributes. + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Construct the new function type using the new arguments. + llvm::Type *RetTy = StructType::get(Context, RetTys); + FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); + + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMABI: Transforming to:" << *NF << "\n" << "From: " << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setCallingConv(F->getCallingConv()); + + // Get a new callgraph node for NF. + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + std::vector DirectUsers; + + for (auto U: F->users()) { + if (isa(U)) + DirectUsers.push_back(U); + } + + // Loop over all of the callers of the function, transforming the call sites + // to pass in the loaded pointers. + for (auto U: DirectUsers) { + CallSite CS(U); + assert(CS.getCalledFunction() == F); + Instruction *Call = CS.getInstruction(); + const AttributeList &CallPAL = CS.getAttributes(); + + SmallVector Args; + AttributeList NewAttrVec; + + // Loop over the operands, inserting loads in the caller. + CallSite::arg_iterator AI = CS.arg_begin(); + ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI, ++ArgIndex) { + if (!ArgsToTransform.count(I)) { + // Unchanged argument + AttributeSet attrs = CallPAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + NewAttrVec = NewAttrVec.addParamAttributes(Context, Args.size(), B); + } + Args.push_back(*AI); + } else if (!I->use_empty()) { + LoadInst *Load = new LoadInst(*AI, (*AI)->getName() + ".val", Call); + Args.push_back(Load); + } + } + + // Push any varargs arguments on the list. + for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { + AttributeSet attrs = CallPAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + NewAttrVec = NewAttrVec.addParamAttributes(Context, Args.size(), B); + } + Args.push_back(*AI); + } + + // Push any localized globals. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + LoadInst *Load = new LoadInst(GV, GV->getName() + ".val", Call); + Args.push_back(Load); + } + + // Add any function attributes. 
+ if (CallPAL.hasAttributes(AttributeList::FunctionIndex)) { + AttrBuilder B(CallPAL.getFnAttributes()); + NewAttrVec = NewAttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + if (isa(Call)) + llvm_unreachable("InvokeInst not supported"); + + CallInst *New = CallInst::Create(NF, Args, "", Call); + New->setCallingConv(CS.getCallingConv()); + New->setAttributes(NewAttrVec); + if (cast(Call)->isTailCall()) + New->setTailCall(); + New->setDebugLoc(Call->getDebugLoc()); + + // Update the callgraph to know that the callsite has been transformed. + auto CalleeNode = static_cast( + CG[Call->getParent()->getParent()]); + CalleeNode->replaceCallEdge(CallSite(Call), New, NF_CGN); + + unsigned Index = 0; + IRBuilder<> Builder(Call); + + New->takeName(Call); + if (!F->getReturnType()->isVoidTy()) + Call->replaceAllUsesWith(Builder.CreateExtractValue(New, Index++, "ret")); + + // Loop over the operands, and copy out all pass by reference values. + AI = CS.arg_begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI) { + // Unused arguments are already eliminated from the call sites. + if (ArgsToTransform.count(I) && !I->use_empty() && + CopyInOutNeeded.count(I)) { + Value *OutVal = Builder.CreateExtractValue(New, Index++); + Builder.CreateStore(OutVal, *AI); + } + } + // Loop over localized globals, and copy out all globals. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + Value *OutVal = Builder.CreateExtractValue(New, Index++); + Builder.CreateStore(OutVal, GV); + } + assert(Index == New->getType()->getStructNumElements() && "type out of sync"); + + // Remove the old call from the function, reducing the use-count of F. + Call->eraseFromParent(); + } + + // Since we have now created the new function, splice the body of the old + // function right into the new function. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Allocas used for transformed arguments. + SmallVector Allocas; + + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I) { + // For an unmodified argument, move the name and users over. + if (!ArgsToTransform.count(I)) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + ++I2; + continue; + } + + if (I->use_empty()) + continue; + + // Otherwise, we transformed this argument. + // + // In the callee, we create an alloca, and store each of the new incoming + // arguments into the alloca. + Instruction *InsertPt = &*(NF->begin()->begin()); + Type *AgTy = I->getType()->getPointerElementType(); + AllocaInst *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + Instruction * NewInst = TheAlloca; + if (I->getType()->getPointerAddressSpace() != 0) { + // Insert addrspace cast + auto AddrSpaceCast = new AddrSpaceCastInst( + TheAlloca, AgTy->getPointerTo(I->getType()->getPointerAddressSpace()), + ""); + AddrSpaceCast->insertAfter(TheAlloca); + NewInst = AddrSpaceCast; + } + if (CopyInOutNeeded.count(I)) + Allocas.push_back(TheAlloca); + + I2->setName(I->getName()); + new StoreInst(I2++, NewInst, InsertPt); + + // Anything that used the arg should now use the alloca. + I->replaceAllUsesWith(NewInst); + NewInst->takeName(I); + } + + // Collect all globals and their corresponding allocas. 
+ SmallDenseMap GlobalsToReplace; + + // Loop over globals and transfer uses of globals over to new arguments. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + + Instruction *InsertPt = &*(NF->begin()->begin()); + Type *AgTy = GV->getType()->getPointerElementType(); + AllocaInst *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + Allocas.push_back(TheAlloca); + + auto ArgIter = NF->arg_begin(); + std::advance(ArgIter, LI.getArgIndex(GV)); + ArgIter->setName(GV->getName() + ".in"); + new StoreInst(ArgIter, TheAlloca, InsertPt); + + TheAlloca->setName(GV->getName() + ".local"); + GlobalsToReplace.insert(std::make_pair(GV, TheAlloca)); + } + // Replaces all globals uses within this new function. + replaceUsesWithinFunction(GlobalsToReplace, NF); + + // Fix all return instructions since we have changed the return type. + Type *NFRetTy = NF->getReturnType(); + for (inst_iterator I = inst_begin(NF), E = inst_end(NF); I != E; /* empty */) { + Instruction *Inst = &*I++; + if (ReturnInst *RI = dyn_cast(Inst)) { + IRBuilder<> Builder(RI); + + // Create new return value, which is a struct type. + Value *RetVal = UndefValue::get(NFRetTy); + unsigned Index = 0; + + if (!F->getReturnType()->isVoidTy()) { + Value *RV = RI->getReturnValue(); + assert(RV && RV->getType()->isSingleValueType() && "type unexpected"); + RetVal = Builder.CreateInsertValue(RetVal, RV, Index++); + } + for (unsigned i = 0, e = Allocas.size(); i < e; ++i) { + Value *V = Builder.CreateLoad(Allocas[i]); + RetVal = Builder.CreateInsertValue(RetVal, V, Index++); + } + + StructType *ST = cast(NFRetTy); + assert(ST->getNumElements() == Index && "type out of sync"); + (void)ST; + + // Return the final struct by value. + Builder.CreateRet(RetVal); + RI->eraseFromParent(); + } + } + + // It turns out sometimes llvm will recycle function pointers which confuses + // this pass. We delete its localization info and mark this function as + // already visited. + GlobalInfo.erase(F); + AlreadyVisited.insert(F); + + NF_CGN->stealCalledFunctionsFrom(CG[F]); + + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around. + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); + + return NF_CGN; +} + +static void breakConstantVector(unsigned i, Instruction *CurInst, + Instruction *InsertPt) { + ConstantVector *CV = cast(CurInst->getOperand(i)); + + // Splat case. + if (auto S = dyn_cast_or_null(CV->getSplatValue())) { + // Turn element into an instruction + auto Inst = S->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Type *NewTy = VectorType::get(Inst->getType(), 1); + Inst = CastInst::Create(Instruction::BitCast, Inst, NewTy, "", CurInst); + Inst->setDebugLoc(CurInst->getDebugLoc()); + + // Splat this value. + IRBuilder<> Builder(InsertPt); + Value *NewVal = Builder.CreateVectorSplat(CV->getNumOperands(), Inst); + + // Update i-th operand with newly created splat. 
+    CurInst->setOperand(i, NewVal);
+  }
+
+  SmallVector<Value *, 8> Vals;
+  bool HasConstExpr = false;
+  for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) {
+    Value *Elt = CV->getOperand(j);
+    if (auto CE = dyn_cast<ConstantExpr>(Elt)) {
+      auto Inst = CE->getAsInstruction();
+      Inst->setDebugLoc(CurInst->getDebugLoc());
+      Inst->insertBefore(InsertPt);
+      Vals.push_back(Inst);
+      HasConstExpr = true;
+    } else
+      Vals.push_back(Elt);
+  }
+
+  if (HasConstExpr) {
+    Value *Val = UndefValue::get(CV->getType());
+    IRBuilder<> Builder(InsertPt);
+    for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j)
+      Val = Builder.CreateInsertElement(Val, Vals[j], j);
+    CurInst->setOperand(i, Val);
+  }
+}
+
+static void breakConstantExprs(Function *F) {
+  for (po_iterator<BasicBlock *> i = po_begin(&F->getEntryBlock()),
+                                 e = po_end(&F->getEntryBlock());
+       i != e; ++i) {
+    BasicBlock *BB = *i;
+    // The effect of this loop is that we process the instructions in reverse
+    // order, and we re-process anything inserted before the instruction
+    // being processed.
+    for (Instruction *CurInst = BB->getTerminator(); CurInst;) {
+      PHINode *PN = dyn_cast<PHINode>(CurInst);
+      for (unsigned i = 0, e = CurInst->getNumOperands(); i < e; ++i) {
+        auto InsertPt = PN ? PN->getIncomingBlock(i)->getTerminator() : CurInst;
+        Value *Op = CurInst->getOperand(i);
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) {
+          Instruction *NewInst = CE->getAsInstruction();
+          NewInst->setDebugLoc(CurInst->getDebugLoc());
+          NewInst->insertBefore(CurInst);
+          CurInst->setOperand(i, NewInst);
+        } else if (isa<ConstantVector>(Op))
+          breakConstantVector(i, CurInst, InsertPt);
+      }
+      CurInst = CurInst == &BB->front() ? nullptr : CurInst->getPrevNode();
+    }
+  }
+}
+
+// For each function, compute the list of globals that need to be passed as
+// copy-in and copy-out arguments.
+void CMABI::AnalyzeGlobals(CallGraph &CG) {
+  Module &M = CG.getModule();
+  // Assuming the device module is self-contained, give global variables and
+  // functions internal linkage so that GlobalDCE can remove them if they are
+  // not used anywhere in the module.
+  for (auto &Global : M.getGlobalList()) {
+    if (!Global.isDeclaration())
+      Global.setLinkage(GlobalValue::InternalLinkage);
+  }
+  for (auto &F : M.getFunctionList()) {
+    // __cm_intrinsic_impl_* functions may be used to emulate mul/div etc.
+    if (GenXIntrinsic::getAnyIntrinsicID(&F) ==
+            GenXIntrinsic::not_any_intrinsic &&
+        !F.getName().contains("__cm_intrinsic_impl_") &&
+        !F.isDeclaration() && !F.hasDLLExportStorageClass())
+      F.setLinkage(GlobalValue::InternalLinkage);
+  }
+  // No global variables.
+  if (M.global_empty())
+    return;
+
+  // Store functions in a SetVector to keep order and make searching efficient.
+ SetVector Funcs; + for (auto I = scc_begin(&CG), IE = scc_end(&CG); I != IE; ++I) { + const std::vector &SCCNodes = *I; + for (const CallGraphNode *Node : SCCNodes) { + Function *F = Node->getFunction(); + if (F != nullptr && !F->isDeclaration()) { + Funcs.insert(F); + breakConstantExprs(F); + } + } + } + auto PrintIndexChecker = [](Use &IUI) { + CallInst *CI = dyn_cast(IUI.getUser()); + if (!CI) + return false; + Function *Callee = CI->getCalledFunction(); + if (!Callee) + return false; + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + return (IntrinID == GenXIntrinsic::genx_print_format_index); + }; + auto UsesPrintChecker = [PrintIndexChecker](const Use &UI) { + auto *User = UI.getUser(); + return std::any_of(User->use_begin(), User->use_end(), PrintIndexChecker); + }; + const auto &DL = M.getDataLayout(); + auto ToLocalize = selectGlobalsToLocalize( + M.globals(), LocalizationLimit.getValue(), + [UsesPrintChecker](const GlobalVariable &GV) { + // don't localize global constant format string if it's used by print_index intrinsic + bool UsesPrintIndex = std::any_of(GV.use_begin(), GV.use_end(), UsesPrintChecker); + return (GV.hasAttribute(genx::FunctionMD::GenXVolatile) || + UsesPrintIndex); + }, + [&DL](const GlobalVariable &GV) { return calcGVWeight(GV, DL); }); + for (auto I = Funcs.begin(), E = Funcs.end(); I != E; ++I) { + Function *Fn = *I; + LLVM_DEBUG(dbgs() << "Visiting " << Fn->getName() << "\n"); + + // Collect globals used directly. + for (GlobalVariable *GV : ToLocalize) { + for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end(); + UI != UE; ++UI) { + Instruction *Inst = dyn_cast(UI->getUser()); + // not used in this function. + if (!Inst || Inst->getParent()->getParent() != Fn) + continue; + + // Find the global being used and populate this info. + for (unsigned i = 0, e = Inst->getNumOperands(); i < e; ++i) { + Value *Op = Inst->getOperand(i); + if (GlobalVariable *GV = dyn_cast(Op)) + addDirectGlobal(Fn, GV); + } + } + } + + // Collect globals used indirectly. + for (inst_iterator II = inst_begin(Fn), IE = inst_end(Fn); II != IE; ++II) { + Instruction *Inst = &*II; + // Ignore InvokeInst. + if (CallInst *CI = dyn_cast(Inst)) { + // Ignore indirect calls + if (Function *Callee = CI->getCalledFunction()) { + // Collect all globals from its callee. + if (!Callee->isDeclaration()) + addIndirectGlobal(Fn, Callee); + } + } + } + } +} + +/*********************************************************************** + * diagnoseOverlappingArgs : attempt to diagnose overlapping by-ref args + * + * The CM language spec says you are not allowed a call with two by-ref args + * that overlap. This is to give the compiler the freedom to implement with + * copy-in copy-out semantics or with an address register. + * + * This function attempts to diagnose code that breaks this restriction. For + * pointer args to the call, it attempts to track how values are loaded using + * the pointer (assumed to be an alloca of the temporary used for copy-in + * copy-out semantics), and how those values then get propagated through + * wrregions and stores. If any vector element in a wrregion or store is found + * that comes from more than one pointer arg, it is reported. + * + * This ignores variable index wrregions, and only traces through instructions + * with the same debug location as the call, so does not work with -g0. 
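+ *
+ * As a purely illustrative sketch (the value names below are invented for
+ * this comment, not taken from any real kernel), the kind of chain the pass
+ * traces for two by-ref args whose copy-out regions touch the same elements
+ * of a matrix %m looks like:
+ *
+ *   %a = load <8 x float>, <8 x float>* %arg1.ref      ; elements tagged arg 1
+ *   %b = load <8 x float>, <8 x float>* %arg2.ref      ; elements tagged arg 2
+ *   %w0 = call <16 x float> @llvm.genx.wrregionf(<16 x float> %m.old, <8 x float> %a, i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
+ *   %w1 = call <16 x float> @llvm.genx.wrregionf(<16 x float> %w0, <8 x float> %b, i32 0, i32 8, i32 1, i16 16, i32 undef, i1 true)
+ *   store <16 x float> %w1, <16 x float>* %m.addr
+ *
+ * The first wrregion writes elements 0..7 with values tagged as arg 1; the
+ * second starts at byte offset 16 (element 4) and writes elements 4..11 with
+ * values tagged as arg 2, so elements 4..7 end up tagged by both args and an
+ * overlap is reported.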
+ */ +void CMABI::diagnoseOverlappingArgs(CallInst *CI) +{ + LLVM_DEBUG(dbgs() << "diagnoseOverlappingArgs " << *CI << "\n"); + auto DL = CI->getDebugLoc(); + if (!DL) + return; + std::map> ValMap; + SmallVector WorkList; + std::set InWorkList; + std::set> Reported; + // Using ArgIndex starting at 1 so we can reserve 0 to mean "element does not + // come from any by-ref arg". + for (unsigned ArgIndex = 1, NumArgs = CI->getNumArgOperands(); + ArgIndex <= NumArgs; ++ArgIndex) { + Value *Arg = CI->getOperand(ArgIndex - 1); + if (!Arg->getType()->isPointerTy()) + continue; + LLVM_DEBUG(dbgs() << "arg " << ArgIndex << ": " << *Arg << "\n"); + // Got a pointer arg. Find its loads (with the same debug loc). + for (auto ui = Arg->use_begin(), ue = Arg->use_end(); ui != ue; ++ui) { + auto LI = dyn_cast(ui->getUser()); + if (!LI || LI->getDebugLoc() != DL) + continue; + LLVM_DEBUG(dbgs() << " " << *LI << "\n"); + // For a load, create a map entry that says that every vector element + // comes from this arg. + unsigned NumElements = 1; + if (auto VT = dyn_cast(LI->getType())) + NumElements = VT->getNumElements(); + auto Entry = &ValMap[LI]; + Entry->resize(NumElements, ArgIndex); + // Add its users (with the same debug location) to the work list. + for (auto ui = LI->use_begin(), ue = LI->use_end(); ui != ue; ++ui) { + auto Inst = cast(ui->getUser()); + if (Inst->getDebugLoc() == DL) + if (InWorkList.insert(Inst).second) + WorkList.push_back(Inst); + } + } + } + // Process the work list. + while (!WorkList.empty()) { + auto Inst = WorkList.back(); + WorkList.pop_back(); + InWorkList.erase(Inst); + LLVM_DEBUG(dbgs() << "From worklist: " << *Inst << "\n"); + Value *Key = nullptr; + SmallVector TempVector; + SmallVectorImpl *VectorToMerge = nullptr; + if (auto SI = dyn_cast(Inst)) { + // Store: set the map entry using the store pointer as the key. It might + // be an alloca of a local variable, or a global variable. + // Strictly speaking this is not properly keeping track of what is being + // merged using load-wrregion-store for a non-SROAd local variable or a + // global variable. Instead it is just merging at the store itself, which + // is good enough for our purposes. + Key = SI->getPointerOperand(); + VectorToMerge = &ValMap[SI->getValueOperand()]; + } else if (auto BC = dyn_cast(Inst)) { + // Bitcast: calculate the new map entry. + Key = BC; + uint64_t OutElementSize = + BC->getType()->getScalarType()->getPrimitiveSizeInBits(); + uint64_t InElementSize = BC->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits(); + int LogRatio = countTrailingZeros(OutElementSize, ZB_Undefined) - + countTrailingZeros(InElementSize, ZB_Undefined); + auto OpndEntry = &ValMap[BC->getOperand(0)]; + if (!LogRatio) + VectorToMerge = OpndEntry; + else if (LogRatio > 0) { + // Result element type is bigger than input element type, so there are + // fewer result elements. Just use an arbitrarily chosen non-zero entry + // of the N input elements to set the 1 result element. + assert(!(OpndEntry->size() & ((1U << LogRatio) - 1))); + for (unsigned i = 0, e = OpndEntry->size(); i != e; i += 1U << LogRatio) { + unsigned FoundArgIndex = 0; + for (unsigned j = 0; j != 1U << LogRatio; ++j) + FoundArgIndex = std::max(FoundArgIndex, (unsigned)(*OpndEntry)[i + j]); + TempVector.push_back(FoundArgIndex); + } + VectorToMerge = &TempVector; + } else { + // Result element type is smaller than input element type, so there are + // multiple result elements per input element. 
+ for (unsigned i = 0, e = OpndEntry->size(); i != e; ++i) + for (unsigned j = 0; j != 1U << -LogRatio; ++j) + TempVector.push_back((*OpndEntry)[i]); + VectorToMerge = &TempVector; + } + } else if (auto CI = dyn_cast(Inst)) { + if (auto CF = CI->getCalledFunction()) { + switch (GenXIntrinsic::getGenXIntrinsicID(CF)) { + default: + break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + // wrregion: As long as it is constant index, propagate the argument + // indices into the appropriate elements of the result. + if (auto IdxC = dyn_cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum))) { + unsigned Idx = 0; + if (!IdxC->isNullValue()) { + auto IdxCI = dyn_cast(IdxC); + if (!IdxCI) { + LLVM_DEBUG(dbgs() << "Ignoring variable index wrregion\n"); + break; + } + Idx = IdxCI->getZExtValue(); + } + Idx /= (CI->getType()->getScalarType()->getPrimitiveSizeInBits() / 8U); + // First copy the "old value" input to the map entry. + auto OpndEntry = &ValMap[CI->getOperand( + GenXIntrinsic::GenXRegion::OldValueOperandNum)]; + auto Entry = &ValMap[CI]; + Entry->clear(); + Entry->insert(Entry->begin(), OpndEntry->begin(), OpndEntry->end()); + // Then copy the "new value" elements according to the region. + TempVector.resize(CI->getType()->getVectorNumElements(), 0); + int VStride = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrVStrideOperandNum))->getSExtValue(); + unsigned Width = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrWidthOperandNum))->getZExtValue(); + int Stride = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrStrideOperandNum))->getSExtValue(); + OpndEntry = &ValMap[CI->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum)]; + unsigned NumElements = OpndEntry->size(); + if (!NumElements) + break; + for (unsigned RowIdx = Idx, Row = 0, Col = 0, + NumRows = NumElements / Width;; Idx += Stride, ++Col) { + if (Col == Width) { + Col = 0; + if (++Row == NumRows) + break; + Idx = RowIdx += VStride; + } + TempVector[Idx] = (*OpndEntry)[Row * Width + Col]; + } + VectorToMerge = &TempVector; + Key = CI; + } + break; + } + } + } + if (!VectorToMerge) + continue; + auto Entry = &ValMap[Key]; + LLVM_DEBUG(dbgs() << "Merging :"; + for (unsigned i = 0; i != VectorToMerge->size(); ++i) + dbgs() << " " << (unsigned)(*VectorToMerge)[i]; + dbgs() << "\ninto " << Key->getName() << ":"; + for (unsigned i = 0; i != Entry->size(); ++i) + dbgs() << " " << (unsigned)(*Entry)[i]; + dbgs() << "\n"); + if (Entry->empty()) + Entry->insert(Entry->end(), VectorToMerge->begin(), VectorToMerge->end()); + else { + assert(VectorToMerge->size() == Entry->size()); + for (unsigned i = 0; i != VectorToMerge->size(); ++i) { + unsigned ArgIdx1 = (*VectorToMerge)[i]; + unsigned ArgIdx2 = (*Entry)[i]; + if (ArgIdx1 && ArgIdx2 && ArgIdx1 != ArgIdx2) { + LLVM_DEBUG(dbgs() << "By ref args overlap: args " << ArgIdx1 << " and " << ArgIdx2 << "\n"); + if (ArgIdx1 > ArgIdx2) + std::swap(ArgIdx1, ArgIdx2); + if (Reported.insert(std::pair(ArgIdx1, ArgIdx2)) + .second) { + // Not already reported. 
+            DiagnosticInfoOverlappingArgs Err(CI, "by reference arguments "
+                + Twine(ArgIdx1) + " and " + Twine(ArgIdx2) + " overlap",
+                DS_Error);
+            Inst->getContext().diagnose(Err);
+          }
+        }
+        (*Entry)[i] = std::max((*Entry)[i], (*VectorToMerge)[i]);
+      }
+    }
+    LLVM_DEBUG(dbgs() << "giving:";
+      for (unsigned i = 0; i != Entry->size(); ++i)
+        dbgs() << " " << (unsigned)(*Entry)[i];
+      dbgs() << "\n");
+    if (Key == Inst) {
+      // Not the case that we have a store and we are using the pointer as
+      // the key. In the other cases that do a merge (bitcast and wrregion),
+      // add users to the work list as long as they have the same debug loc.
+      for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) {
+        auto User = cast<Instruction>(ui->getUser());
+        if (User->getDebugLoc() == DL)
+          if (InWorkList.insert(User).second)
+            WorkList.push_back(User);
+      }
+    }
+  }
+}
+
+/***********************************************************************
+ * DiagnosticInfoOverlappingArgs initializer from Instruction
+ *
+ * If the Instruction has a DebugLoc, then that is used for the error
+ * location.
+ * Otherwise, the location is unknown.
+ */
+DiagnosticInfoOverlappingArgs::DiagnosticInfoOverlappingArgs(Instruction *Inst,
+    const Twine &Desc, DiagnosticSeverity Severity)
+    : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0)
+{
+  auto DL = Inst->getDebugLoc();
+  if (DL) {
+    Filename = DL.get()->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  Description = Desc.str();
+}
+
+/***********************************************************************
+ * DiagnosticInfoOverlappingArgs::print : print the error/warning message
+ */
+void DiagnosticInfoOverlappingArgs::print(DiagnosticPrinter &DP) const
+{
+  std::string Loc(
+        (Twine(!Filename.empty() ? Filename : "<unknown>")
+        + ":" + Twine(Line)
+        + (!Col ? Twine() : Twine(":") + Twine(Col))
+        + ": ")
+      .str());
+  DP << Loc << Description;
+}
+
+
+char CMABI::ID = 0;
+INITIALIZE_PASS_BEGIN(CMABI, "cmabi", "Fix ABI issues for the genx backend", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(CMABI, "cmabi", "Fix ABI issues for the genx backend", false, false)
+
+Pass *llvm::createCMABIPass() { return new CMABI(); }
+
+namespace {
+
+// A well-formed pattern for passing an argument by reference.
+//
+// (Alloca)
+// %argref1 = alloca <8 x float>, align 32
+//
+// (CopyInRegion/CopyInStore)
+// %rdr = tail call <8 x float> @llvm.genx.rdregionf(<960 x float> %m, i32 0, i32 8, i32 1, i16 0, i32 undef)
+// call void @llvm.genx.vstore(<8 x float> %rdr, <8 x float>* %argref)
+//
+// (CopyOutRegion/CopyOutLoad)
+// %ld = call <8 x float> @llvm.genx.vload(<8 x float>* %argref)
+// %wr = call <960 x float> @llvm.genx.wrregionf(<960 x float> %m, <8 x float> %ld, i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
+//
+struct ArgRefPattern {
+  // Alloca of this reference argument.
+  AllocaInst *Alloca;
+
+  // The copy-in value.
+  CallInst *CopyInRegion;
+  CallInst *CopyInStore;
+
+  // The copy-out value.
+  CallInst *CopyOutLoad;
+  CallInst *CopyOutRegion;
+
+  // Load and store instructions on the arg alloca.
+  SmallVector<CallInst *, 8> VLoads;
+  SmallVector<CallInst *, 8> VStores;
+
+  explicit ArgRefPattern(AllocaInst *AI)
+      : Alloca(AI), CopyInRegion(nullptr), CopyInStore(nullptr),
+        CopyOutLoad(nullptr), CopyOutRegion(nullptr) {}
+
+  // Match a copy-in and copy-out pattern. Return true on success.
+ bool match(DominatorTree &DT, PostDominatorTree &PDT); + void process(); +}; + +struct CMLowerVLoadVStore : public FunctionPass { + static char ID; + CMLowerVLoadVStore() : FunctionPass(ID) { + initializeCMLowerVLoadVStorePass(*PassRegistry::getPassRegistry()); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + } + + virtual bool runOnFunction(Function &F) override; + +private: + bool promoteAllocas(Function &F); + bool lowerLoadStore(Function &F); +}; + +} // namespace + +char CMLowerVLoadVStore::ID = 0; +INITIALIZE_PASS_BEGIN(CMLowerVLoadVStore, "CMLowerVLoadVStore", + "Lower CM reference vector loads and stores", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(CMLowerVLoadVStore, "CMLowerVLoadVStore", + "Lower CM reference vector loads and stores", false, false) + + +bool CMLowerVLoadVStore::runOnFunction(Function &F) { + bool Changed = false; + Changed |= promoteAllocas(F); + Changed |= lowerLoadStore(F); + return Changed; +} + +// Lower remaining vector load/store intrinsic calls into normal load/store +// instructions. +bool CMLowerVLoadVStore::lowerLoadStore(Function &F) { + auto M = F.getParent(); + DenseMap AllocaMap; + // collect all the allocas that store the address of genx-volatile variable + for (auto& G : M->getGlobalList()) { + if (!G.hasAttribute("genx_volatile")) + continue; + std::vector WL; + for (auto UI = G.user_begin(); UI != G.user_end();) { + auto U = *UI++; + WL.push_back(U); + } + + while (!WL.empty()) { + auto Inst = WL.back(); + WL.pop_back(); + if (auto CE = dyn_cast(Inst)) { + for (auto UI = CE->user_begin(); UI != CE->user_end();) { + auto U = *UI++; + WL.push_back(U); + } + } + else if (auto CI = dyn_cast(Inst)) { + for (auto UI = CI->user_begin(); UI != CI->user_end();) { + auto U = *UI++; + WL.push_back(U); + } + } + else if (auto SI = dyn_cast(Inst)) { + auto Ptr = SI->getPointerOperand()->stripPointerCasts(); + if (auto PI = dyn_cast(Ptr)) { + AllocaMap[PI] = &G; + } + } + } + } + + // lower all vload/vstore into normal load/store. + std::vector ToErase; + for (Instruction &Inst : instructions(F)) { + if (GenXIntrinsic::isVLoadStore(&Inst)) { + auto *Ptr = Inst.getOperand(0); + if (GenXIntrinsic::isVStore(&Inst)) + Ptr = Inst.getOperand(1); + auto AS0 = cast(Ptr->getType())->getAddressSpace(); + Ptr = Ptr->stripPointerCasts(); + auto GV = dyn_cast(Ptr); + if (GV) { + if (!GV->hasAttribute("genx_volatile")) + GV = nullptr; + } + else if (auto LI = dyn_cast(Ptr)) { + auto PV = LI->getPointerOperand()->stripPointerCasts(); + if (auto PI = dyn_cast(PV)) { + if (AllocaMap.find(PI) != AllocaMap.end()) { + GV = AllocaMap[PI]; + } + } + } + if (GV == nullptr) { + // change to load/store + IRBuilder<> Builder(&Inst); + if (GenXIntrinsic::isVStore(&Inst)) + Builder.CreateStore(Inst.getOperand(0), Inst.getOperand(1)); + else { + auto LI = Builder.CreateLoad(Inst.getOperand(0), Inst.getName()); + LI->setDebugLoc(Inst.getDebugLoc()); + Inst.replaceAllUsesWith(LI); + } + ToErase.push_back(&Inst); + } + else { + // change to vload/vstore that has the same address space as + // the global-var in order to clean up unnecessary addr-cast. 
+ auto AS1 = GV->getType()->getAddressSpace(); + if (AS0 != AS1) { + IRBuilder<> Builder(&Inst); + if (GenXIntrinsic::isVStore(&Inst)) { + auto PtrTy = cast(Inst.getOperand(1)->getType()); + PtrTy = PointerType::get(PtrTy->getElementType(), AS1); + auto PtrCast = Builder.CreateAddrSpaceCast(Inst.getOperand(1), PtrTy); + Type* Tys[] = { Inst.getOperand(0)->getType(), + PtrCast->getType() }; + Value* Args[] = { Inst.getOperand(0), PtrCast }; + Function* Fn = GenXIntrinsic::getGenXDeclaration( + F.getParent(), GenXIntrinsic::genx_vstore, Tys); + Builder.CreateCall(Fn, Args, Inst.getName()); + } + else { + auto PtrTy = cast(Inst.getOperand(0)->getType()); + PtrTy = PointerType::get(PtrTy->getElementType(), AS1); + auto PtrCast = Builder.CreateAddrSpaceCast(Inst.getOperand(0), PtrTy); + Type* Tys[] = { Inst.getType(), PtrCast->getType() }; + Function* Fn = GenXIntrinsic::getGenXDeclaration( + F.getParent(), GenXIntrinsic::genx_vload, Tys); + Value* VLoad = Builder.CreateCall(Fn, PtrCast, Inst.getName()); + Inst.replaceAllUsesWith(VLoad); + } + ToErase.push_back(&Inst); + } + } + } + } + + for (auto Inst : ToErase) { + Inst->eraseFromParent(); + } + + return !ToErase.empty(); +} + +static bool isBitCastForLifetimeMarker(Value *V) { + if (!V || !isa(V)) + return false; + for (auto U : V->users()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(U); + if (IntrinsicID != Intrinsic::lifetime_start && + IntrinsicID != Intrinsic::lifetime_end) + return false; + } + return true; +} + +// Check whether two values are bitwise identical. +static bool isBitwiseIdentical(Value *V1, Value *V2) { + assert(V1 && V2 && "null value"); + if (V1 == V2) + return true; + if (BitCastInst *BI = dyn_cast(V1)) + V1 = BI->getOperand(0); + if (BitCastInst *BI = dyn_cast(V2)) + V2 = BI->getOperand(0); + + // Special case arises from vload/vstore. + if (GenXIntrinsic::isVLoad(V1) && GenXIntrinsic::isVLoad(V2)) { + auto L1 = cast(V1); + auto L2 = cast(V2); + // Check if loading from the same location. + if (L1->getOperand(0) != L2->getOperand(0)) + return false; + + // Check if this pointer is local and only used in vload/vstore. + Value *Addr = L1->getOperand(0); + if (!isa(Addr)) + return false; + for (auto UI : Addr->users()) { + if (isa(UI)) { + for (auto U : UI->users()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(U); + if (IntrinsicID != Intrinsic::lifetime_start && + IntrinsicID != Intrinsic::lifetime_end) + return false; + } + } else { + if (!GenXIntrinsic::isVLoadStore(UI)) + return false; + } + } + + // Check if there is no store to the same location in between. + if (L1->getParent() != L2->getParent()) + return false; + BasicBlock::iterator I = L1->getParent()->begin(); + for (; &*I != L1 && &*I != L2; ++I) + /*empty*/; + assert(&*I == L1 || &*I == L2); + auto IEnd = (&*I == L1) ? L2->getIterator() : L1->getIterator(); + for (; I != IEnd; ++I) { + Instruction *Inst = &*I; + if (GenXIntrinsic::isVStore(Inst) && Inst->getOperand(1) == Addr) + return false; + } + + // OK. + return true; + } + + // Cannot prove. + return false; +} + +bool ArgRefPattern::match(DominatorTree &DT, PostDominatorTree &PDT) { + assert(Alloca); + if (Alloca->use_empty()) + return false; + + // check if all users are load/store. 
+ SmallVector Loads; + SmallVector Stores; + for (auto U : Alloca->users()) + if (GenXIntrinsic::isVLoad(U)) + Loads.push_back(cast(U)); + else if (GenXIntrinsic::isVStore(U)) + Stores.push_back(cast(U)); + else if (isBitCastForLifetimeMarker(U)) + continue; + else + return false; + + if (Loads.empty() || Stores.empty()) + return false; + + // find a unique store that dominates all other users if exists. + auto Cmp = [&](CallInst *L, CallInst *R) { return DT.dominates(L, R); }; + CopyInStore = *std::min_element(Stores.begin(), Stores.end(), Cmp); + CopyInRegion = dyn_cast(CopyInStore->getArgOperand(0)); + if (!CopyInRegion || !CopyInRegion->hasOneUse() || !GenXIntrinsic::isRdRegion(CopyInRegion)) + return false; + + for (auto SI : Stores) + if (SI != CopyInStore && !Cmp(CopyInStore, SI)) + return false; + for (auto LI : Loads) + if (LI != CopyInStore && !Cmp(CopyInStore, LI)) + return false; + + // find a unique load that post-dominates all other users if exists. + auto PostCmp = [&](CallInst *L, CallInst *R) { + BasicBlock *LBB = L->getParent(); + BasicBlock *RBB = R->getParent(); + if (LBB != RBB) + return PDT.dominates(LBB, RBB); + + // Loop through the basic block until we find L or R. + BasicBlock::const_iterator I = LBB->begin(); + for (; &*I != L && &*I != R; ++I) + /*empty*/; + + return &*I == R; + }; + CopyOutLoad = *std::min_element(Loads.begin(), Loads.end(), PostCmp); + + // Expect copy-out load has one or zero use. It is possible there + // is no use as the region becomes dead after this subroutine call. + // + if (!CopyOutLoad->use_empty()) { + if (!CopyOutLoad->hasOneUse()) + return false; + CopyOutRegion = dyn_cast(CopyOutLoad->user_back()); + if (!GenXIntrinsic::isWrRegion(CopyOutRegion)) + return false; + } + + for (auto SI : Stores) + if (SI != CopyOutLoad && !PostCmp(CopyOutLoad, SI)) + return false; + for (auto LI : Loads) + if (LI != CopyOutLoad && !PostCmp(CopyOutLoad, LI)) + return false; + + // Ensure read-in and write-out to the same region. It is possible that region + // collasping does not simplify region accesses completely. + // Probably we should assert on region descriptors. + if (CopyOutRegion && + !isBitwiseIdentical(CopyInRegion->getOperand(0), + CopyOutRegion->getOperand(0))) + return false; + + // It should be OK to rewrite all loads and stores into the argref. + VLoads.swap(Loads); + VStores.swap(Stores); + return true; +} + +void ArgRefPattern::process() { + // 'Spill' the base region into memory during rewriting. + IRBuilder<> Builder(Alloca); + Function *RdFn = CopyInRegion->getCalledFunction(); + assert(RdFn); + Type *BaseAllocaTy = RdFn->getFunctionType()->getParamType(0); + AllocaInst *BaseAlloca = Builder.CreateAlloca(BaseAllocaTy, nullptr, + Alloca->getName() + ".refprom"); + + Builder.SetInsertPoint(CopyInRegion); + Builder.CreateStore(CopyInRegion->getArgOperand(0), BaseAlloca); + + if (CopyOutRegion) { + Builder.SetInsertPoint(CopyOutRegion); + CopyOutRegion->setArgOperand(0, Builder.CreateLoad(BaseAlloca)); + } + + // Rewrite all stores. + for (auto ST : VStores) { + Builder.SetInsertPoint(ST); + Value *OldVal = Builder.CreateLoad(BaseAlloca); + // Always use copy-in region arguments as copy-out region + // arguments do not dominate this store. 
+ auto M = ST->getParent()->getParent()->getParent(); + Value *Args[] = {OldVal, + ST->getArgOperand(0), + CopyInRegion->getArgOperand(1), // vstride + CopyInRegion->getArgOperand(2), // width + CopyInRegion->getArgOperand(3), // hstride + CopyInRegion->getArgOperand(4), // offset + CopyInRegion->getArgOperand(5), // parent width + ConstantInt::getTrue(Type::getInt1Ty(M->getContext()))}; + auto ID = OldVal->getType()->isFPOrFPVectorTy() ? GenXIntrinsic::genx_wrregionf + : GenXIntrinsic::genx_wrregioni; + Type *Tys[] = {Args[0]->getType(), Args[1]->getType(), Args[5]->getType(), + Args[7]->getType()}; + auto WrFn = GenXIntrinsic::getGenXDeclaration(M, ID, Tys); + Value *NewVal = Builder.CreateCall(WrFn, Args); + Builder.CreateStore(NewVal, BaseAlloca); + ST->eraseFromParent(); + } + + // Rewrite all loads + for (auto LI : VLoads) { + if (LI->use_empty()) + continue; + + Builder.SetInsertPoint(LI); + Value *SrcVal = Builder.CreateLoad(BaseAlloca); + SmallVector Args(CopyInRegion->arg_operands()); + Args[0] = SrcVal; + Value *Val = Builder.CreateCall(RdFn, Args); + LI->replaceAllUsesWith(Val); + LI->eraseFromParent(); + } +} + +// Allocas that are used in reference argument passing may be promoted into the +// base region. +bool CMLowerVLoadVStore::promoteAllocas(Function &F) { + auto &DT = getAnalysis().getDomTree(); + auto &PDT = getAnalysis().getPostDomTree(); + bool Modified = false; + + SmallVector Allocas; + for (auto &Inst : F.front().getInstList()) { + if (auto AI = dyn_cast(&Inst)) + Allocas.push_back(AI); + else + break; + } + + for (auto AI : Allocas) { + ArgRefPattern ArgRef(AI); + if (ArgRef.match(DT, PDT)) { + ArgRef.process(); + Modified = true; + } + } + + return Modified; +} + +Pass *llvm::createCMLowerVLoadVStorePass() { return new CMLowerVLoadVStore; } diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp new file mode 100644 index 000000000000..67ef5b5541e7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp @@ -0,0 +1,701 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+
+======================= end_copyright_notice ==================================*/
+
+//===----------------------------------------------------------------------===//
+//
+/// CMImpParam
+/// ----------
+///
+/// As well as the explicit kernel args declared in the CM kernel function,
+/// certain implicit args are also passed. These fall into two categories:
+///
+/// 1. fields set up in r0 by the hardware, depending on which dispatch method
+///    is being used (e.g. media walker);
+///
+/// 2. implicit args set up along with the explicit args in CURBE by the CM
+///    runtime.
+///
+/// The r0 implicit args are represented in LLVM IR by special intrinsics, and
+/// the GenX backend maps these onto the corresponding reserved vISA registers.
+///
+/// The CM runtime implicit args in (2) above were, in vISA 3.2 and earlier,
+/// also represented by special LLVM intrinsics and reserved vISA registers.
+/// Because they are specific to the CM runtime, and not any other user of
+/// vISA, vISA 3.3 removed them; instead they are handled much like other
+/// kernel args in the input table.
+///
+/// The *kind* byte in the input table has two fields:
+///
+/// * the *category* field, saying whether the input is general/surface/etc.;
+///
+/// * the *provenance* field, saying whether the input is an explicit one from
+///   the CM source, or an implicit one generated by this pass. This is a
+///   protocol agreed between the CM compiler (in fact this pass) and the CM
+///   runtime.
+///
+/// Within the CM compiler, the vISA input table for a kernel is represented by
+/// an array of kind bytes, one per argument of the kernel function.
+///
+/// Clang codegen still generates the special intrinsics for these CM runtime
+/// implicit args. It is the job of this CMImpParam pass to transform those
+/// intrinsics:
+///
+/// * where the intrinsic for a CM runtime implicit arg is used somewhere:
+///
+///   - a global variable is created for it;
+///
+///   - for any kernel that uses the implicit arg (or can reach a subroutine that
+///     uses it), the implicit arg is added to the input table in the kernel
+///     metadata and as an extra arg to the definition of the kernel itself,
+///     and its value is stored into the global variable;
+///
+/// * each use of the intrinsic for a CM runtime implicit arg is transformed into
+///   a load of the corresponding global variable.
+///
+/// Like any other global variable, the subsequent CMABI pass turns the global
+/// variable for an implicit arg into local variable(s) passed into subroutines
+/// if necessary.
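+///
+/// As an illustrative sketch (the names below are invented for this comment,
+/// not copied from a real module), a use of the local-size intrinsic such as
+///
+///   %ls = call <3 x i32> @llvm.genx.local.size.v3i32()
+///
+/// is rewritten into a load of an internal global
+///
+///   %ls = load <3 x i32>, <3 x i32>* @__imparg_llvm.genx.local.size
+///
+/// and every kernel that (transitively) needs that value gains one extra
+/// argument, whose value is stored to @__imparg_llvm.genx.local.size on entry
+/// to the kernel, with a matching implicit *kind* byte appended to the
+/// kernel's argument metadata.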
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmimpparam" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +using namespace llvm; + +namespace llvm { +void initializeCMImpParamPass(PassRegistry &); +} // namespace llvm + +namespace { + +class ImplicitUseInfo { +public: + typedef std::set ImplicitSetTy; + + explicit ImplicitUseInfo(Function *F) : Fn(F) {} + ImplicitUseInfo() : Fn(nullptr) {} + + Function *getFunction() const { return Fn; } + + bool empty() const { return Implicits.empty(); } + ImplicitSetTy &getImplicits() { return Implicits; } + const ImplicitSetTy &getImplicits() const { return Implicits; } + + // \brief Add an implicit arg intrinsic + void addImplicit(unsigned IID) { Implicits.insert(IID); } + + void merge(const ImplicitUseInfo &IUI) { + Implicits.insert(IUI.Implicits.begin(), IUI.Implicits.end()); + } + + void dump() const { print(dbgs()); } + + void print(raw_ostream &OS, unsigned depth = 0) const { + for (auto IID : Implicits) { + OS.indent(depth) << GenXIntrinsic::getAnyName(IID, None) << "\n"; + } + } + +private: + // \brief The function being analyzed + Function *Fn; + + // \brief Implicit arguments used + ImplicitSetTy Implicits; +}; + +struct CMImpParam : public ModulePass { + static char ID; + bool IsCmRT; + + CMImpParam(bool isCmRT = true) : ModulePass(ID), IsCmRT(isCmRT) { + initializeCMImpParamPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + } + + virtual StringRef getPassName() const { return "CM Implicit Params"; } + + virtual bool runOnModule(Module &M); + + void dump() const { print(dbgs()); } + virtual void print(raw_ostream &OS, const Module *M = nullptr) const; + +private: + void replaceWithGlobal(CallInst *CI, unsigned IID); + bool AnalyzeImplicitUse(Module &M); + void MergeImplicits(ImplicitUseInfo &implicits, Function *F); + void PropagateImplicits(Function *F, Module &M, + ImplicitUseInfo &implicits); + CallGraphNode *ProcessKernel(Function *F); + + static Value *getValue(Metadata *M) { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + } + + // Convert to implicit thread payload related intrinsics. 
+ bool ConvertToOCLPayload(Module &M); + + uint32_t MapToKind(unsigned IID) { + using namespace genx; + switch (IID) { + default: + return KernelMetadata::AK_NORMAL; + case GenXIntrinsic::genx_print_buffer: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_PRINTF_BUFFER; + case GenXIntrinsic::genx_local_size: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_LOCAL_SIZE; + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_LOCAL_ID; + case GenXIntrinsic::genx_group_count: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_GROUP_COUNT; + case GenXIntrinsic::genx_get_scoreboard_deltas: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_SB_DELTAS; + case GenXIntrinsic::genx_get_scoreboard_bti: + return KernelMetadata::AK_SURFACE | KernelMetadata::IMP_SB_BTI; + case GenXIntrinsic::genx_get_scoreboard_depcnt: + return KernelMetadata::AK_SURFACE | KernelMetadata::IMP_SB_DEPCNT; + case GenXIntrinsic::genx_local_id_x: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_X; + case GenXIntrinsic::genx_local_id_y: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_Y; + case GenXIntrinsic::genx_local_id_z: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_Z; + case GenXIntrinsic::genx_group_or_local_size: + return KernelMetadata::AK_NORMAL | + KernelMetadata::IMP_OCL_GROUP_OR_LOCAL_SIZE; + } + return KernelMetadata::AK_NORMAL; + } + + // \brief Returns the implicit use info associated with a function + ImplicitUseInfo &getImplicitUseInfo(Function *F) { + if (!ImplicitsInfo.count(F)) { + ImplicitUseInfo *IUI = new ImplicitUseInfo(F); + ImplicitsInfoObjs.push_back(IUI); + ImplicitsInfo[F] = IUI; + return *IUI; + } + return *ImplicitsInfo[F]; + } + + // \brief Returns the implict use info associated with a function (kernel) + // and also creates a new one that represents the total implicits for the + // kernel as a whole (stored in a different object) + ImplicitUseInfo &getImplicitUseInfoKernel(Function *F) { + assert(Kernels.count(F)); + + if (KernelsInfo.count(F)) { + // Kernel already processed + return *KernelsInfo[F]; + } + + ImplicitUseInfo *IUI = new ImplicitUseInfo(F); + ImplicitsInfoObjs.push_back(IUI); + KernelsInfo[F] = IUI; + + if (ImplicitsInfo.count(F)) { + IUI->merge(*ImplicitsInfo[F]); + } + + return *IUI; + } + + const ImplicitUseInfo *implicitUseInfoKernelExist(Function *F) const { + if (KernelsInfo.count(F)) { + auto CI = KernelsInfo.find(F); + return CI->second; + } + + return nullptr; + } + + void addImplicit(Function *F, unsigned IID) { + getImplicitUseInfo(F).addImplicit(IID); + } + + GlobalVariable *getIIDGlobal(Function *F, unsigned IID) { + if (GlobalsMap.count(IID)) + return GlobalsMap[IID]; + + Type * Ty = getIntrinRetType(F->getContext(), IID); + assert(Ty); + GlobalVariable *NewVar = new GlobalVariable( + *F->getParent(), Ty, false, + GlobalVariable::InternalLinkage, + Constant::getNullValue(Ty), + "__imparg_" + GenXIntrinsic::getAnyName(IID, None)); + GlobalsMap[IID] = NewVar; + + return NewVar; + } + + Type *getIntrinRetType(LLVMContext &Context, unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_print_buffer: + return llvm::Type::getInt64Ty(Context); + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_group_count: + return llvm::VectorType::get(llvm::Type::getInt32Ty(Context), 3); + case GenXIntrinsic::genx_local_id16: + return 
llvm::VectorType::get(llvm::Type::getInt16Ty(Context), 3); + default: + // Should be able to extract the type from the intrinsic + // directly as no overloading is required (if it is then + // you need to define specific type in a case statement above) + FunctionType *FTy = dyn_cast_or_null( + GenXIntrinsic::getAnyType(Context, IID)); + if (FTy) + return FTy->getReturnType(); + } + return nullptr; + } + + // This map captures all implicit uses to be transformed + SmallDenseMap ImplicitsInfo; + + // This map captures all implicit uses that are required for a kernel + // (includes sub function uses) + SmallDenseMap KernelsInfo; + + // All kernels (entry points) in module being processed + SmallPtrSet Kernels; + + // Already visited functions + SmallPtrSet AlreadyVisited; + + // ImplicitUseInfo objects created + SmallVector ImplicitsInfoObjs; + + // Functions that contain implicit arg intrinsics + SmallPtrSet ContainImplicit; + + // GlobalVariables that have been created for an intrinsic + SmallDenseMap GlobalsMap; +}; + +} // namespace + +bool CMImpParam::runOnModule(Module &M) { + bool changed = false; + + // Apply necessary changes if kernels are compiled for OpenCL runtime. + changed |= ConvertToOCLPayload(M); + + // Analyze functions for implicit use intrinsic invocation + changed |= AnalyzeImplicitUse(M); + + // Collect all CM kernels from named metadata and also traverse the call graph + // to determine what the total implicit uses are for the top level kernels + if (NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (auto F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef)))) { + Kernels.insert(F); + AlreadyVisited.clear(); + ImplicitUseInfo &implicits = getImplicitUseInfoKernel(F); + PropagateImplicits(F, M, implicits); + // for OCL/L0 RT we should unconditionally add + // implicit PRIVATE_BASE argument which is not supported on CM RT + if (!implicits.empty() || !IsCmRT) { + ProcessKernel(F); + changed |= true; + } + } + } + } + for (ImplicitUseInfo *Obj : ImplicitsInfoObjs) + delete Obj; + + return changed; +} + +// Replace the given instruction with a load from a global +void CMImpParam::replaceWithGlobal(CallInst *CI, unsigned IID) { + GlobalVariable *GV = getIIDGlobal(CI->getParent()->getParent(), IID); + LoadInst *Load = new LoadInst(GV, GV->getName() + ".val", CI); + CI->replaceAllUsesWith(Load); +} + +// For each function, see if it uses an intrinsic that in turn requires an +// implicit kernel argument +// (such as llvm.genx.local.size) +bool CMImpParam::AnalyzeImplicitUse(Module &M) { + bool changed = false; + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function *Fn = &*I; + LLVM_DEBUG(dbgs() << "AnalyzeImplicitUse visiting " << Fn->getName() << "\n"); + + bool implicitUse = false; + + SmallVector ToErase; + + // FIXME I think this should scan function declarations to find the implicit + // arg intrinsics, then scan their uses, instead of scanning the whole code + // to find calls to them. 
+ for (inst_iterator II = inst_begin(Fn), IE = inst_end(Fn); II != IE; ++II) { + Instruction *Inst = &*II; + if (CallInst *CI = dyn_cast(Inst)) { + if (Function *Callee = CI->getCalledFunction()) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(Callee); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + switch (IID) { + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + case GenXIntrinsic::genx_group_count: + case GenXIntrinsic::genx_get_scoreboard_deltas: + case GenXIntrinsic::genx_get_scoreboard_bti: + case GenXIntrinsic::genx_get_scoreboard_depcnt: + case GenXIntrinsic::genx_local_id_x: + case GenXIntrinsic::genx_local_id_y: + case GenXIntrinsic::genx_local_id_z: + case GenXIntrinsic::genx_group_or_local_size: + case GenXIntrinsic::genx_print_buffer: + LLVM_DEBUG(dbgs() << "AnalyzeImplicitUse found " + << GenXIntrinsic::getGenXName((GenXIntrinsic::ID)IID, None)); + addImplicit(Fn, IID); + implicitUse = true; + + // Replace the intrinsic with a load of a global at this point + replaceWithGlobal(CI, IID); + ToErase.push_back(CI); + changed = true; + break; + default: + // Ignore (default added to prevent compiler warnings) + break; + } + } + } + } + } + + for (auto CI : ToErase) + CI->eraseFromParent(); + + // Mark this function as containing an implicit use intrinsic + if (implicitUse) + ContainImplicit.insert(Fn); + } + + return changed; +} + +// Convert to implicit thread payload related intrinsics. +bool CMImpParam::ConvertToOCLPayload(Module &M) { + // Check if this kernel is compiled for OpenCL runtime. + bool DoConversion = false; + + if (NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + auto F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef))); + if (F && (F->hasFnAttribute(genx::FunctionMD::OCLRuntime) || !IsCmRT)) { + DoConversion = true; + break; + } + } + } + + if (!DoConversion) + return false; + + bool Changed = false; + auto getFn = [=, &M](unsigned ID, Type *Ty) { + return M.getFunction(GenXIntrinsic::getAnyName(ID, Ty)); + }; + + // Convert genx_local_id -> zext(genx_local_id16) + Type *Ty32 = VectorType::get(Type::getInt32Ty(M.getContext()), 3); + Type *Ty16 = VectorType::get(Type::getInt16Ty(M.getContext()), 3); + if (auto LIDFn = getFn(GenXIntrinsic::genx_local_id, Ty32)) { + Function *LID16 = GenXIntrinsic::getGenXDeclaration( + &M, GenXIntrinsic::genx_local_id16, Ty16); + for (auto UI = LIDFn->user_begin(); UI != LIDFn->user_end();) { + auto UInst = dyn_cast(*UI++); + if (UInst) { + IRBuilder<> Builder(UInst); + Value *Val = Builder.CreateCall(LID16); + Val = Builder.CreateZExt(Val, Ty32); + UInst->replaceAllUsesWith(Val); + UInst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +// Merge implicit uses from the supplied function with implicit set passed in +void CMImpParam::MergeImplicits(ImplicitUseInfo &implicits, Function *F) { + assert(ImplicitsInfo.count(F) && "Function not found in implicits info map"); + auto IInfo = ImplicitsInfo[F]; + implicits.merge(*IInfo); +} + +// Determine if the named function uses any functions tagged with implicit use +// in the call-graph +void CMImpParam::PropagateImplicits(Function *F, Module &M, + ImplicitUseInfo &implicits) { + // Traverse the call graph from the Kernel to determine what implicits are + // used + CallGraph &CG = getAnalysis().getCallGraph(); + + // If this node 
has already been processed then return immediately + if (AlreadyVisited.count(F)) + return; + + // Add this node to the already visited list + AlreadyVisited.insert(F); + + // Start the traversal + CallGraphNode *N = CG[F]; + // Inspect all children (recursive) + for (auto Children : *N) { + auto Func = Children.second->getFunction(); + // Does this function have implicit arg use? + if (ContainImplicit.count(Func)) { + // Yes - add the implicits it contains to the set so far + MergeImplicits(implicits, Func); + } + // Also recursively process children of this node + PropagateImplicits(Func, M, implicits); + } +} + +// Process a kernel - loads from a global (and the globals) have already been +// added if required elsewhere (in doInitialization) +// We've already determined that this is a kernel and that it requires some +// implicit arguments adding +CallGraphNode *CMImpParam::ProcessKernel(Function *F) { + LLVMContext &Context = F->getContext(); + + assert(Kernels.count(F) && "ProcessKernel invoked on non-kernel CallGraphNode"); + + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + + // Determine the new argument list + SmallVector ArgTys; + + // First transfer all the explicit arguments from the old kernel + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + ArgTys.push_back(I->getType()); + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, ArgIndex, B); + } + } + + bool UsesImplicits = KernelsInfo.count(F) > 0; + + // Now add all the implicit arguments + if (UsesImplicits) { + ImplicitUseInfo *IUI = KernelsInfo[F]; + for (unsigned IID : IUI->getImplicits()) { + ArgTys.push_back(getIntrinRetType(Context, IID)); + // TODO: Might need to also add any attributes from the intrinsic at some + // point + } + } + if (!IsCmRT) { + // PRIVATE_BASE arg + ArgTys.push_back(Type::getInt64Ty(F->getContext())); + } + + FunctionType *NFTy = FunctionType::get(F->getReturnType(), ArgTys, false); + assert((NFTy != F->getFunctionType()) && + "type out of sync, expect bool arguments)"); + + // Add any function attributes + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Create new function body and insert into the module + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMImpParam: Transforming to: " << *NF << "\n" << "From: " + << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setSubprogram(F->getSubprogram()); // tranfer debug-info + + // Now to splice the body of the old function into the new function + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transferring uses of the old arguments to the + // new arguments, also tranferring over the names as well + Function::arg_iterator I2 = NF->arg_begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++I2) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + } + + // Get the insertion point ready for stores to globals + Instruction &FirstI = *NF->getEntryBlock().begin(); + llvm::SmallVector ImpKinds; + + if (UsesImplicits) { + ImplicitUseInfo *IUI = KernelsInfo[F]; + for (unsigned IID : 
IUI->getImplicits()) { + // We known that for each IID implicit we've already added an arg + // Rename the arg to something more meaningful here + assert(I2 != NF->arg_end() && + "fewer parameters for new function than expected"); + I2->setName("__arg_" + GenXIntrinsic::getAnyName(IID, None)); + + // Also insert a new store at the start of the function to the global + // variable used for this implicit argument intrinsic + assert(GlobalsMap.count(IID) && + "no global associated with this imp arg intrinsic"); + new StoreInst(I2, GlobalsMap[IID], &FirstI); + + // Prepare the kinds that will go into the metadata + ImpKinds.push_back(MapToKind(IID)); + + ++I2; + } + } + if (!IsCmRT) { + I2->setName("privBase"); + ImpKinds.push_back(genx::KernelMetadata::AK_NORMAL | + genx::KernelMetadata::IMP_OCL_PRIVATE_BASE); + } + + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + if (F->hasDLLExportStorageClass()) + NF->setDLLStorageClass(F->getDLLStorageClass()); + // Scan the CM kernel metadata and replace with NF + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (auto VM = dyn_cast_or_null( + Node->getOperand(genx::KernelMDOp::FunctionRef))) { + if (F == VM->getValue()) { + Node->replaceOperandWith(genx::KernelMDOp::FunctionRef, ValueAsMetadata::get(NF)); + llvm::SmallVector ArgKinds; + + // Create a new MDNode of Kinds + // First add all the current Kinds for explicit operands + MDNode *TypeNode = + dyn_cast(Node->getOperand(genx::KernelMDOp::ArgKinds)); + assert(TypeNode); + for (unsigned i = 0; i < TypeNode->getNumOperands(); ++i) + ArgKinds.push_back(TypeNode->getOperand(i)); + for (uint32_t Kind : ImpKinds) + ArgKinds.push_back(ValueAsMetadata::getConstant( + ConstantInt::get(Type::getInt32Ty(Context), Kind))); + llvm::MDNode *Kinds = llvm::MDNode::get(Context, ArgKinds); + Node->replaceOperandWith(genx::KernelMDOp::ArgKinds, Kinds); + } + } + } + } + + // Now that the old function is dead, delete it. 
If there is a dangling
+  // reference to the CallGraphNode, just leave the dead function around.
+  NF_CGN->stealCalledFunctionsFrom(CG[F]);
+  CallGraphNode *CGN = CG[F];
+  if (CGN->getNumReferences() == 0)
+    delete CG.removeFunctionFromModule(CGN);
+  else
+    F->setLinkage(Function::ExternalLinkage);
+
+  return NF_CGN;
+}
+
+void CMImpParam::print(raw_ostream &OS, const Module *M) const {
+  OS << "Kernels : \n";
+
+  for (auto Func : Kernels) {
+    OS.indent(4) << Func->getName() << "\n";
+
+    const ImplicitUseInfo *IUI = implicitUseInfoKernelExist(Func);
+    if (IUI)
+      IUI->print(OS, 8);
+  }
+
+  OS << "Functions with implicit arg intrinsics : \n";
+
+  for (auto FuncInfoPair : ImplicitsInfo) {
+    OS.indent(4) << FuncInfoPair.first->getName() << "\n";
+
+    FuncInfoPair.second->print(OS, 8);
+  }
+}
+
+
+char CMImpParam::ID = 0;
+INITIALIZE_PASS_BEGIN(CMImpParam, "cmimpparam",
+                      "Transformations required to support implicit arguments",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(CMImpParam, "cmimpparam",
+                    "Transformations required to support implicit arguments",
+                    false, false)
+
+Pass *llvm::createCMImpParamPass(bool IsCMRT) { return new CMImpParam(IsCMRT); }
diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp
new file mode 100644
index 000000000000..50d927561369
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp
@@ -0,0 +1,621 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+
+//===----------------------------------------------------------------------===//
+//
+/// CMKernelArgOffset
+/// -----------------
+///
+/// This pass determines the offset of each CM kernel argument, and adds it to
+/// the kernel metadata.
+///
+/// This pass also changes the linkage type for kernels, functions, and globals,
+/// assuming that functions and globals have no external exposure; if they end
+/// up unused, the later GlobalDCE pass can therefore delete them.
+///
+/// A CM kernel has metadata containing, amongst other things, an array of
+/// *kind* bytes, one byte per kernel argument, that will be output in the vISA
+/// kernel input table. This pass calculates the offset of each kernel argument,
+/// and adds an array to the kernel metadata containing the calculated offsets.
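+///
+/// As a rough illustration (the metadata layout is simplified here and the
+/// numbers are invented for this comment), a kernel with two i32 arguments
+/// could end up with an offsets node such as
+///
+///   !{i32 32, i32 36}
+///
+/// attached to its entry in !genx.kernels, meaning the first explicit argument
+/// is placed at byte offset 32 (the start of r1 for a 32-byte GRF) and the
+/// second immediately after it.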
+/// +/// Argument offsets start at 32, as r0 is reserved by the various thread +/// dispatch mechanisms. +/// +/// The pass attempts to calculate the kernel argument offsets in a way that +/// minimizes space wasted by holes. +/// +/// The arguments are processed in three sets, with each (non-empty) set +/// starting in a new GRF: +/// +/// 1. explicit kernel arguments (i.e. ones that appeared in the CM source); +/// +/// 2. implicit kernel (non-thread) arguments; +/// +/// 3. implicit thread arguments. +/// +/// These three sets need to be allocated as three separate chunks of whole GRF +/// registers in that order by the CM runtime. In theory, the CM runtime can +/// cope with the compiler creating a different ordering, but to do so it needs +/// to create its own ordering and insert mov instructions at the start of the +/// kernel, which is suboptimal. However, I am not clear whether that mechanism +/// works, and it has not been tested. +/// +/// There is a compiler option that can be used to disable argument re-ordering. +/// This is for developers who are using the output asm files directly and want +/// to control the argument order explicitly. The option is +/// -enable-kernel-arg-reordering but is typically invoked as -mllvm +/// -enable-kernel-arg-reordering=false (the default is true) +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmkernelargoffset" + +#include "llvmWrapper/Support/Alignment.h" + +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +static cl::opt + EnableKernelArgReordering("enable-kernel-arg-reordering", cl::init(true), + cl::Hidden, + cl::desc("Enable kernel argument reordering")); + +namespace llvm { +void initializeCMKernelArgOffsetPass(PassRegistry &); +} + +namespace { + +struct GrfParamZone { + unsigned Start; + unsigned End; + GrfParamZone(unsigned s, unsigned e) : Start(s), End(e){}; +}; + +// Diagnostic information for error/warning from this pass. +class DiagnosticInfoCMKernelArgOffset : public DiagnosticInfoOptimizationBase { +private: + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + static void emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity = DS_Error); + DiagnosticInfoCMKernelArgOffset(DiagnosticSeverity Severity, + const Function &Fn, const DebugLoc &DLoc, + StringRef Msg) + : DiagnosticInfoOptimizationBase((DiagnosticKind)getKindID(), Severity, + /*PassName=*/nullptr, Msg, Fn, DLoc) {} + // This kind of message is always enabled, and not affected by -rpass. + virtual bool isEnabled() const override { return true; } + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoCMKernelArgOffset::KindID = 0; + +// CMKernelArgOffset pass +class CMKernelArgOffset : public ModulePass { + genx::KernelMetadata *KM = nullptr; + + // Emit code for OCL runtime. 
+  bool OCLCodeGen = false;
+
+public:
+  static char ID;
+  CMKernelArgOffset(unsigned GrfByteSize = 32, bool OCLCodeGen = false)
+      : ModulePass(ID), OCLCodeGen(OCLCodeGen), GrfByteSize(GrfByteSize) {
+    initializeCMKernelArgOffsetPass(*PassRegistry::getPassRegistry());
+    GrfMaxCount = 256;
+    GrfStartOffset = GrfByteSize;
+    GrfEndOffset = 128 * GrfByteSize;
+  }
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+  virtual StringRef getPassName() const { return "CM kernel arg offset"; }
+  virtual bool runOnModule(Module &M);
+
+private:
+  void processKernel(MDNode *Node);
+  void processKernelOnOCLRT(MDNode *Node, Function *F);
+
+  static Value *getValue(Metadata *M) {
+    if (auto VM = dyn_cast<ValueAsMetadata>(M))
+      return VM->getValue();
+    return nullptr;
+  }
+
+  // Check whether there is an input/output argument attribute.
+  void checkArgKinds(Function *F) {
+    assert(KM && KM->isKernel());
+    for (unsigned i = 0, e = KM->getNumArgs(); i != e; ++i) {
+      auto IOKind = KM->getArgInputOutputKind(i);
+      // If there is an input/output attribute, the compiler must not freely
+      // reorder arguments.
+      if (IOKind != genx::KernelMetadata::IO_Normal) {
+        EnableKernelArgReordering = false;
+        break;
+      }
+    }
+  }
+
+  // Re-layout the thread payload for the OpenCL runtime.
+  bool enableOCLCodeGen() const { return OCLCodeGen; }
+
+  // Update the offset MD node.
+  void updateOffsetMD(MDNode *KernelMD,
+                      SmallDenseMap<Argument *, unsigned> &PlacedArgs) {
+    assert(KM);
+    Function *F = dyn_cast_or_null<Function>(
+        getValue(KernelMD->getOperand(genx::KernelMDOp::FunctionRef)));
+    assert(F && "nullptr kernel");
+
+    // All arguments now have offsets. Update the metadata node containing the
+    // offsets.
+    assert(F->arg_size() == KM->getNumArgs() &&
+           "Mismatch between metadata for kernel and number of args");
+    SmallVector<Metadata *, 8> ArgOffsets;
+    auto I32Ty = Type::getInt32Ty(F->getContext());
+    for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) {
+      Argument *Arg = &*ai;
+      ArgOffsets.push_back(ValueAsMetadata::getConstant(
+          ConstantInt::get(I32Ty, PlacedArgs[Arg])));
+    }
+    MDNode *OffsetsNode = MDNode::get(F->getContext(), ArgOffsets);
+    KernelMD->replaceOperandWith(genx::KernelMDOp::ArgOffsets, OffsetsNode);
+
+    // Give an error on too many arguments.
+    if (ArgOffsets.size() >= GrfMaxCount)
+      DiagnosticInfoCMKernelArgOffset::emit(&F->front().front(),
+                                            "Too many kernel arguments");
+  }
+
+  unsigned GrfByteSize;
+  unsigned GrfMaxCount;
+  unsigned GrfStartOffset;
+  unsigned GrfEndOffset;
+};
+
+} // namespace
+
+char CMKernelArgOffset::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CMKernelArgOffset, "cmkernelargoffset",
+                      "CM kernel arg offset determination", false, false)
+INITIALIZE_PASS_END(CMKernelArgOffset, "cmkernelargoffset",
+                    "CM kernel arg offset determination", false, false)
+
+Pass *llvm::createCMKernelArgOffsetPass(unsigned GrfByteSize, bool OCLCodeGen) {
+  return new CMKernelArgOffset(GrfByteSize, OCLCodeGen);
+}
+
+/***********************************************************************
+ * runOnModule : run the CM kernel arg offset pass
+ */
+bool CMKernelArgOffset::runOnModule(Module &M) {
+  NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels);
+  if (!Named)
+    return false;
+
+  // Process each kernel in the CM kernel metadata.
+ for (unsigned i = 0, e = Named->getNumOperands(); i != e; ++i) { + MDNode *KernelNode = Named->getOperand(i); + if (KernelNode) + processKernel(KernelNode); + } + + return true; +} + +/*********************************************************************** + * processKernel : process one kernel + * + * Enter: Node = metadata node for one kernel + * + * See GenXMetadata.h for complete list of kernel metadata + */ +void CMKernelArgOffset::processKernel(MDNode *Node) { + Function *F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef))); + if (!F) + return; + + // change the linkage attribute for the kernel + F->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); + + genx::KernelMetadata KM(F); + this->KM = &KM; + checkArgKinds(F); + + // Layout kernel arguments differently if to run on OpenCL runtime. + if (enableOCLCodeGen()) { + return processKernelOnOCLRT(Node, F); + } + + auto getTypeSizeInBytes = [=](Type *Ty) { + const DataLayout &DL = F->getParent()->getDataLayout(); + if (auto PT = dyn_cast(Ty)) + return DL.getPointerTypeSize(Ty); + return static_cast(Ty->getPrimitiveSizeInBits() / 8); + }; + + // setup kernel inputs, optionally reordering the assigned offsets for + // improved packing where appropriate. The reordering algorithm replicates + // that used in the legacy Cm compiler, as certain media walker applications + // seem sensitive to the way the kernel inputs are laid out. + SmallDenseMap PlacedArgs; + unsigned Offset = 0; + if (EnableKernelArgReordering /*DoReordering*/) { + // Reorder kernel input arguments. Arguments are placed in size order, + // largest first (then in natural argument order where arguments are the + // same size). Each argument is placed at the lowest unused suitably + // aligned offset. So, in general big arguments are placed first with the + // smaller arguments being fit opportunistically into the gaps left + // between arguments placed earlier. + // + // Arguments that are at least one GRF in size must be aligned to a GRF + // boundary. Arguments smaller than a GRF must not cross a GRF boundary. + // + // FreeZones describes unallocated portions of the kernel input space, + // and is list of non-overlapping start-end pairs, ordered lowest first. + // Initially it consists of a single pair that describes the whole space + + SmallVector FreeZones; + FreeZones.push_back(GrfParamZone(GrfStartOffset, GrfEndOffset)); + + // Repeatedly iterate over the arguments list, each time looking for the + // largest one that hasn't been processed yet. + // But ignore implicit args for now as they want to go after all the others. + + do { + Argument *BestArg = nullptr; + unsigned BestSize; + unsigned BestElemSize; + + auto ArgKinds = KM.getArgKinds(); + auto Kind = ArgKinds.begin(); + for (Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); i != e; + ++i, ++Kind) { + Argument *Arg = &*i; + if (*Kind & 0xf8) + continue; // implicit arg + + if (PlacedArgs.find(Arg) != PlacedArgs.end()) + // Already done this one. + continue; + + Type *Ty = Arg->getType(); + unsigned Bytes = getTypeSizeInBytes(Ty); + + if (BestArg == nullptr || BestSize < Bytes) { + BestArg = Arg; + BestSize = Bytes; + BestElemSize = getTypeSizeInBytes(Ty->getScalarType()); + } + } + + if (BestArg == nullptr) + // All done. + break; + + // The best argument in this cycle has been found. Search FreeZones for + // a suitably sized and aligned gap. 
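+      //
+      // Illustrative walk-through (editor's note, numbers assume the default
+      // GrfByteSize of 32): for explicit arguments of 32, 8 and 4 bytes, the
+      // 32-byte argument is placed first at offset 32, the 8-byte one at
+      // offset 64 and the 4-byte one at offset 72, so the remaining free zone
+      // (later used for the implicit arguments) starts at offset 76.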
+ + unsigned Align; + + if (BestSize > GrfByteSize) + Align = GrfByteSize; + else + Align = BestElemSize; + + auto zi = FreeZones.begin(); + auto ze = FreeZones.end(); + + unsigned Start = 0, End = 0; + + for (; zi != ze; ++zi) { + GrfParamZone &Zone = *zi; + + Start = alignTo(Zone.Start, Align); + End = Start + BestSize; + + if ((Start % GrfByteSize) != 0 && + (Start / GrfByteSize) != (End - 1) / GrfByteSize) { + Start = alignTo(Zone.Start, GrfByteSize); + End = Start + BestSize; + } + + if (End <= Zone.End) + // Found one. This should never fail unless we have too many + // parameters to start with. + break; + } + + assert(zi != ze && + "unable to allocate argument offset (too many arguments?)"); + + // Exclude the found block from the free zones list. This may require + // that the found zone be split in two if the start of the block is + // not suitably aligned. + + GrfParamZone &Zone = *zi; + + if (Zone.Start == Start) + Zone.Start = End; + else { + unsigned NewEnd = Zone.End; + Zone.End = Start; + ++zi; + FreeZones.insert(zi, GrfParamZone(End, NewEnd)); + } + + PlacedArgs[BestArg] = Start; + } while (true); + // Now process the implicit args. First get the offset at the start of the + // last free zone. Process the implicit kernel args first, then the + // implicit thread args. + Offset = FreeZones.back().Start; + for (int WantThreadImplicit = 0; WantThreadImplicit != 2; + ++WantThreadImplicit) { + bool FirstThreadImplicit = WantThreadImplicit; + auto ArgKinds = KM.getArgKinds(); + auto Kind = ArgKinds.begin(); + for (Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); i != e; + ++i, ++Kind) { + Argument *Arg = &*i; + if (!(*Kind & 0xf8)) + continue; // not implicit arg + int IsThreadImplicit = (*Kind >> 3) == 3; // local_id + if (WantThreadImplicit != IsThreadImplicit) + continue; + Type *Ty = Arg->getType(); + unsigned Bytes = Ty->getPrimitiveSizeInBits() / 8U; + unsigned Align = Ty->getScalarSizeInBits() / 8U; + // If this is the first thread implicit arg, put it in a new GRF. + if (FirstThreadImplicit) + Align = GrfByteSize; + FirstThreadImplicit = false; + Offset = alignTo(Offset, Align); + if ((Offset & (GrfByteSize - 1)) + Bytes > GrfByteSize) { + // GRF align if arg would cross GRF boundary + Offset = alignTo(Offset, GrfByteSize); + } + PlacedArgs[Arg] = Offset; + Offset += Bytes; + } + } + } else { + // No argument reordering. Arguments are placed at increasing offsets + // in their natural order, aligned according to their type. + // + // Again, arguments that are at least one GRF in size must be aligned to + // a GRF boundary. Arguments smaller than a GRF must not cross a GRF + // boundary. + + // kernel input start offset + auto &DL = F->getParent()->getDataLayout(); + Offset = GrfStartOffset; + + // Place an argument and update offset. + // Arguments larger than a GRF must be at least GRF-aligned. Arguments + // smaller than a GRF may not cross GRF boundaries. This means that + // arguments cross a GRF boundary must be GRF aligned. 
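+    //
+    // For example (editor's illustration, assuming GrfByteSize == 32): an
+    // 8-byte vector<float, 2> argument with the running Offset at 58 is first
+    // aligned to 60 by its 4-byte element alignment; bytes 60..67 would then
+    // straddle the GRF boundary at 64, so placeArg below bumps it to offset 64.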
+ auto placeArg = [&](Argument *Arg, unsigned ByteSize, unsigned Align) { + Offset = alignTo(Offset, Align); + unsigned StartGRF = Offset / GrfByteSize; + unsigned EndGRF = (Offset + ByteSize - 1) / GrfByteSize; + if (StartGRF != EndGRF) + Offset = alignTo(Offset, GrfByteSize); + PlacedArgs[Arg] = Offset; + Offset += ByteSize; + }; + + for (auto &Arg : F->args()) { + Type *Ty = Arg.getType(); + unsigned Bytes = 0, Alignment = 0; + if (Ty->isPointerTy()) { + Bytes = DL.getPointerTypeSize(Ty); + Alignment = IGCLLVM::getAlignmentValue( + DL.getPointerABIAlignment(Ty->getPointerAddressSpace())); + } else { + Bytes = Ty->getPrimitiveSizeInBits() / 8; + Alignment = IGCLLVM::getAlignmentValue(Ty->getScalarSizeInBits() / 8); + } + placeArg(&Arg, Bytes, Alignment); + } + } + + // Update the offset MD node. + updateOffsetMD(Node, PlacedArgs); + + this->KM = nullptr; +} + +/*********************************************************************** + * DiagnosticInfoCMKernelArgOffset::emit : emit an error or warning + */ +void DiagnosticInfoCMKernelArgOffset::emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity) { + DiagnosticInfoCMKernelArgOffset Err(Severity, *Inst->getParent()->getParent(), + Inst->getDebugLoc(), Msg); + Inst->getContext().diagnose(Err); +} + +void CMKernelArgOffset::processKernelOnOCLRT(MDNode *Node, Function *F) { + assert(KM); + // Assign BTI values. + { + unsigned Idx = 0; + auto ArgKinds = KM->getArgKinds(); + auto Kind = ArgKinds.begin(); + for (auto &Arg : F->args()) { + if (*Kind == genx::KernelMetadata::AK_SAMPLER || + *Kind == genx::KernelMetadata::AK_SURFACE) { + int32_t BTI = KM->getBTI(Idx); + assert(BTI >= 0 && "unassigned BTI"); + + Type *Ty = Arg.getType(); + if (Ty->isPointerTy()) { + SmallVector ToErase; + + assert(Arg.hasOneUse() && "invalid surface input"); + auto ArgUse = Arg.use_begin()->getUser(); + assert(isa(ArgUse) && "invalid surface input usage"); + ToErase.push_back(cast(ArgUse)); + + for (auto ui = ArgUse->use_begin(), ue = ArgUse->use_end(); ui != ue; + ++ui) { + auto User = cast(ui->getUser()); + User->replaceAllUsesWith( + ConstantInt::get(User->getType(), BTI)); + ToErase.push_back(User); + } + + for (auto i = ToErase.rbegin(), e = ToErase.rend(); i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + } else { + Arg.replaceAllUsesWith(ConstantInt::get(Arg.getType(), BTI)); + } + } + ++Kind, ++Idx; + } + } + + SmallDenseMap PlacedArgs; + { + // OpenCL SIMD8 thread payloads are organized as follows: + // + // 0 1 2 3 4 5 6 7 + // R0: GX GY GZ + // R1: LIDx LIDy LIDz + // + unsigned Offset = GrfStartOffset; + + unsigned ThreadPayloads[] = { + Offset // R1, local_id_x, local_id_y, local_id_z + }; + auto getImpOffset = [&](genx::KernelArgInfo AI) -> int { + if (AI.isLocalIDs()) + return ThreadPayloads[0]; + return -1; + }; + + // Starting offsets for non-implicit arguments. + Offset += 1 * GrfByteSize; + + // Place an argument and update offset. + // Arguments larger than a GRF must be at least GRF-aligned. Arguments + // smaller than a GRF may not cross GRF boundaries. This means that + // arguments cross a GRF boundary must be GRF aligned. + auto placeArg = [&](Argument *Arg, unsigned ByteSize, unsigned Align) { + Offset = alignTo(Offset, Align); + unsigned StartGRF = Offset / GrfByteSize; + unsigned EndGRF = (Offset + ByteSize - 1) / GrfByteSize; + if (StartGRF != EndGRF) + Offset = alignTo(Offset, GrfByteSize); + PlacedArgs[Arg] = Offset; + Offset += ByteSize; + }; + + // First scan, assign implicit arguments. 
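+    //
+    // Editor's note, for illustration (assuming GrfByteSize == 32): the
+    // local-ID argument is pinned to R1 (byte offset 32) via getImpOffset,
+    // while the remaining implicit arguments (local size, group count, print
+    // buffer, private base) are packed by placeArg from offset 64 (R2)
+    // onwards, ahead of the explicit arguments handled by the second scan.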
+ auto ArgKinds = KM->getArgKinds(); + auto Kind = ArgKinds.begin(); + for (auto &Arg : F->args()) { + genx::KernelArgInfo AI(*Kind++); + int ImpOffset = getImpOffset(AI); + if (ImpOffset > 0) { + PlacedArgs[&Arg] = ImpOffset; + continue; + } + + if (AI.isLocalSize() || AI.isGroupCount() || AI.isPrintBuffer() || + AI.isPrivateBase()) { + unsigned Bytes = Arg.getType()->getPrimitiveSizeInBits() / 8; + unsigned Align = Arg.getType()->getScalarSizeInBits() / 8; + placeArg(&Arg, Bytes, Align); + } + } + + // Second scan, assign normal arguments. + Kind = ArgKinds.begin(); + unsigned Idx = 0; + for (auto &Arg : F->args()) { + genx::KernelArgInfo AI(*Kind++); + bool IsBuffer = KM->isBufferType(Idx++); + + // Skip alaready assigned arguments. + if (PlacedArgs.count(&Arg)) + continue; + + // image/sampler arguments do not allocate vISA inputs + // buffer arguments do allocate unused vISA inputs + if (!AI.isNormalCategory() && !IsBuffer) { + PlacedArgs[&Arg] = genx::KernelMetadata::SKIP_OFFSET_VAL; + continue; + } + + Type *Ty = Arg.getType(); + auto &DL = F->getParent()->getDataLayout(); + unsigned Alignment = 0; + unsigned Bytes = 0; + if (IsBuffer) { + // Buffer is treated as stateless global pointer! + Bytes = DL.getPointerSize(); + Alignment = IGCLLVM::getAlignmentValue(DL.getPointerABIAlignment(0)); + } else if (Ty->isPointerTy()) { + Bytes = DL.getPointerTypeSize(Ty); + Alignment = IGCLLVM::getAlignmentValue( + DL.getPointerABIAlignment(Ty->getPointerAddressSpace())); + } else { + Bytes = Ty->getPrimitiveSizeInBits() / 8; + Alignment = IGCLLVM::getAlignmentValue(Ty->getScalarSizeInBits() / 8); + } + placeArg(&Arg, Bytes, Alignment); + } + } + + updateOffsetMD(Node, PlacedArgs); +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp new file mode 100644 index 000000000000..4fcb90e732cc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp @@ -0,0 +1,925 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// Implementation of methods for CMRegion class +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +// Find the datalayout if possible. +const DataLayout *GetDL(Value *V) { + if (auto Inst = dyn_cast_or_null(V)) + return &Inst->getParent()->getParent()->getParent()->getDataLayout(); + if (auto Arg = dyn_cast_or_null(V)) + return &Arg->getParent()->getParent()->getDataLayout(); + return nullptr; +} + +/*********************************************************************** + * Region constructor from a type + */ +CMRegion::CMRegion(Type *Ty, const DataLayout *DL) + : ElementBytes(0), NumElements(1), VStride(0), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0), + ParentWidth(0) +{ + assert(!Ty->isAggregateType() && + "cannot create region based on an aggregate type"); + ElementTy = Ty; + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + Width = NumElements; + } + if (DL) { + unsigned BitSize = DL->getTypeSizeInBits(ElementTy); + ElementBytes = alignTo<8>(BitSize) / 8; + } else { + unsigned BitSize = ElementTy->getPrimitiveSizeInBits(); + ElementBytes = alignTo<8>(BitSize) / 8; + assert(ElementBytes && "Cannot compute element size without data layout"); + } +} + +/*********************************************************************** + * Region constructor from a value + */ +CMRegion::CMRegion(Value *V, const DataLayout *DL) + : CMRegion(V->getType(), DL ? DL : GetDL(V)) {} + +/*********************************************************************** + * Region constructor from a rd/wr region and its BaleInfo + * This also works with rdpredregion and wrpredregion, with Offset in + * bits rather than bytes, and with ElementBytes set to 1. + */ +CMRegion::CMRegion(Instruction *Inst, bool WantParentWidth) + : ElementBytes(0), NumElements(1), VStride(1), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0), + ParentWidth(0) +{ + // Determine where to get the subregion value from and which arg index + // the region parameters start at. 
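+  //
+  // Reminder (editor's note): the parameters follow the usual GenX
+  // <vstride;width,stride> region form. For example, on a 16-element vector a
+  // region with VStride = 8, Width = 4, Stride = 2 and offset 0 describes two
+  // rows of four elements: {0,2,4,6} and {8,10,12,14}.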
+ unsigned ArgIdx = 0; + Value *Subregion = 0; + assert(isa(Inst)); + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdpredregion: + NumElements = Inst->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(1))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_wrpredregion: + NumElements = Inst->getOperand(1)->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(2))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + ArgIdx = 1; + // The size/type of the region is given by the return value: + Subregion = Inst; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + ArgIdx = 2; + // The size/type of the region is given by the "subregion value to + // write" operand: + Subregion = Inst->getOperand(1); + // For wrregion, while we're here, also get the mask. We set mask to NULL + // if the mask operand is constant 1 (i.e. not predicated). + Mask = Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(Mask)) + if (C->isAllOnesValue()) + Mask = 0; + break; + default: + assert(0); + } + // Get the region parameters. + assert(Subregion); + ElementTy = Subregion->getType(); + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + ElementBytes = ElementTy->getPrimitiveSizeInBits() / 8; + if (ElementTy->getPrimitiveSizeInBits()) + ElementBytes = ElementBytes ? ElementBytes : 1; + VStride = cast(Inst->getOperand(ArgIdx))->getSExtValue(); + Width = cast(Inst->getOperand(ArgIdx + 1))->getSExtValue(); + Stride = cast(Inst->getOperand(ArgIdx + 2))->getSExtValue(); + ArgIdx += 3; + // Get the start index. + Value *V = Inst->getOperand(ArgIdx); + assert(V->getType()->getScalarType()->isIntegerTy(16) && + "region index must be i16 or vXi16 type"); + +#if 0 // _DEBUG + // In one transform, this check does not work in the middle of transformation + if (VectorType *VT = dyn_cast(V->getType())) + assert(VT->getNumElements() * Width == NumElements && + "vector region index size mismatch"); +#endif + + if (ConstantInt *CI = dyn_cast(V)) + Offset = CI->getSExtValue(); // Constant index. + else { + Indirect = V; // Index is variable; assume no baled in add. + // For a variable index, get the parent width arg. + ConstantInt *PW = dyn_cast(Inst->getOperand(ArgIdx + 1)); + if (PW) + ParentWidth = PW->getZExtValue(); + } + // We do some trivial legalization here. The legalization pass does not + // make these changes; instead we do them here so they are not permanently + // written back into the IR but are made on the fly each time some other + // pass uses this code to get the region info. + if (NumElements == 1) { + Width = Stride = 1; + VStride = 0; + } else { + if (NumElements <= Width) { + Width = NumElements; + VStride = 0; + } else if ((unsigned)VStride == Width * Stride) { + // VStride == Width * Stride, so we can canonicalize to a 1D region, + // but only if not indirect or not asked to preserve parentwidth, + // and never if multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + VStride = 0; + ParentWidth = 0; + } + } else if (Width == 1) { + // We can turn a 2D width 1 region into a 1D region, but if it is + // indirect it invalidates ParentWidth. 
So only do it if not asked + // to keep ParentWidth. Also we cannot do it if it is multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + Stride = VStride; + VStride = 0; + ParentWidth = 0; + } + } + if (Stride == 0 && Width == NumElements) { + // Canonical scalar region. + Width = 1; + VStride = 0; + } + } +} + +/*********************************************************************** + * Region constructor from bitmap of which elements to set + * + * Enter: Bits = bitmap of which elements to set + * ElementBytes = bytes per element + * + * It is assumed that Bits represents a legal 1D region. + */ +CMRegion::CMRegion(unsigned Bits, unsigned ElementBytes) + : ElementBytes(ElementBytes), ElementTy(0), NumElements(1), VStride(1), + Width(1), Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0) +{ + assert(Bits); + Offset = countTrailingZeros(Bits, ZB_Undefined); + Bits >>= Offset; + Offset *= ElementBytes; + if (Bits != 1) { + Stride = countTrailingZeros(Bits & ~1, ZB_Undefined); + NumElements = Width = countPopulation(Bits); + } +} + +/*********************************************************************** + * CMRegion::getSubregion : modify Region struct for a subregion + * + * Enter: StartIdx = start index of subregion (in elements) + * Size = size of subregion (in elements) + * + * This does not modify the Mask; the caller needs to do that separately. + */ +void CMRegion::getSubregion(unsigned StartIdx, unsigned Size) +{ + if (Indirect && isa(Indirect->getType())) { + // Vector indirect (multi indirect). Set IndirectIdx to the index of + // the start element in the vector indirect. + IndirectIdx = StartIdx / Width; + StartIdx %= Width; + } + int AddOffset = StartIdx / Width * VStride; + AddOffset += StartIdx % Width * Stride; + AddOffset *= ElementBytes; + Offset += AddOffset; + if (!(StartIdx % Width) && !(Size % Width)) { + // StartIdx is at the start of a row and Size is a whole number of + // rows. + } else if (StartIdx % Width + Size > Width) { + // The subregion goes over a row boundary. This can only happen if there + // is only one row split and it is exactly in the middle. + VStride += (Size / 2 - Width) * Stride; + Width = Size / 2; + } else { + // Within a single row. 
+ Width = Size; + VStride = Size * Stride; + } + NumElements = Size; +} + +/*********************************************************************** + * CMRegion::createRdRegion : create rdregion intrinsic from "this" Region + * + * Enter: Input = vector value to extract subregion from + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give the new instruction + * AllowScalar = true to return scalar if region is size 1 + * + * Return: newly created instruction + */ +Instruction *CMRegion::createRdRegion(Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL, bool AllowScalar) +{ + assert(ElementBytes && "not expecting i1 element type"); + auto OffsetInElem = Offset / ElementBytes; + (void)OffsetInElem; + assert(OffsetInElem >= 0 && + OffsetInElem < Input->getType()->getVectorNumElements() && + "initial offset is out of range of input vector"); + + Value *StartIdx = getStartIdx(Name, InsertBefore, DL); + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *ParentWidthArg = UndefValue::get(I32Ty); + if (Indirect) + ParentWidthArg = ConstantInt::get(I32Ty, ParentWidth); + Value *Args[] = { // Args to new rdregion: + Input, // input to original rdregion + ConstantInt::get(I32Ty, VStride), // vstride + ConstantInt::get(I32Ty, Width), // width + ConstantInt::get(I32Ty, Stride), // stride + StartIdx, // start index (in bytes) + ParentWidthArg // parent width (if variable start index) + }; + Type *ElTy = cast(Args[0]->getType())->getElementType(); + Type *RegionTy; + if (NumElements != 1 || !AllowScalar) + RegionTy = VectorType::get(ElTy, NumElements); + else + RegionTy = ElTy; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + auto IID = ElTy->isFloatingPointTy() + ? GenXIntrinsic::genx_rdregionf : GenXIntrinsic::genx_rdregioni; + Function *Decl = getGenXRegionDeclaration(M, IID, RegionTy, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createWrRegion : create wrregion instruction for subregion + * CMRegion::createWrConstRegion : create wrconstregion instruction for subregion + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert (can be scalar, as long as + * region size is 1) + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrregion instruction. However, if it would have had a + * predication mask of all 0s, it is omitted and OldVal is returned + * instead. + */ +Value *CMRegion::createWrRegion(Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + return createWrCommonRegion(OldVal->getType()->isFPOrFPVectorTy() + ? 
GenXIntrinsic::genx_wrregionf : GenXIntrinsic::genx_wrregioni, + OldVal, Input, + Name, InsertBefore, DL); +} + +Value *CMRegion::createWrConstRegion(Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + assert(!Indirect); + assert(!Mask); + assert(isa(Input)); + return createWrCommonRegion(GenXIntrinsic::genx_wrconstregion, OldVal, Input, + Name, InsertBefore, DL); +} + +Value *CMRegion::createWrCommonRegion(GenXIntrinsic::ID IID, Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + assert(ElementBytes && "not expecting i1 element type"); + if (isa(Input->getType())) + assert(NumElements == Input->getType()->getVectorNumElements() && + "input value and region are inconsistent"); + else + assert(NumElements == 1 && "input value and region are inconsistent"); + assert(OldVal->getType()->getScalarType() == + Input->getType()->getScalarType() && + "scalar type mismatch"); + Value *StartIdx = getStartIdx(Name, InsertBefore, DL); + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *ParentWidthArg = UndefValue::get(I32Ty); + if (Indirect) + ParentWidthArg = ConstantInt::get(I32Ty, ParentWidth); + // Get the mask value. If R.Mask is 0, then the wrregion is unpredicated + // and we just use constant 1. + Value *MaskArg = Mask; + if (!MaskArg) + MaskArg = ConstantInt::get(Type::getInt1Ty(Input->getContext()), 1); + // Build the wrregion. + Value *Args[] = { // Args to new wrregion: + OldVal, // original vector + Input, // value to write into subregion + ConstantInt::get(I32Ty, VStride), // vstride + ConstantInt::get(I32Ty, Width), // width + ConstantInt::get(I32Ty, Stride), // stride + StartIdx, // start index (in bytes) + ParentWidthArg, // parent width (if variable start index) + MaskArg // mask + }; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, IID, nullptr, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createRdPredRegion : create rdpredregion instruction + * CMRegion::createRdPredRegionOrConst : create rdpredregion instruction, or + * simplify to constant + * + * Enter: Input = vector value to extract subregion from + * Index = start index of subregion + * Size = size of subregion + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new rdpredregion instruction + * + * Unlike createRdRegion, this is a static method in Region, because you pass + * the region parameters (the start index and size) directly into this method. 
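+ *
+ * For example (editor's sketch; the mangled intrinsic name depends on the
+ * overloaded types): createRdPredRegion(P, 8, 16, "sub", InsertPt, DL) on a
+ * 32-wide predicate P builds a call along the lines of
+ * @llvm.genx.rdpredregion.v16i1.v32i1(P, 8), extracting elements 8..23.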
+ */ +Instruction *CMRegion::createRdPredRegion(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *Args[] = { // Args to new rdpredregion call: + Input, // input predicate + ConstantInt::get(I32Ty, Index) // start offset + }; + auto RetTy = VectorType::get(Args[0]->getType()->getScalarType(), Size); + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, GenXIntrinsic::genx_rdpredregion, + RetTy, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + if (NewInst->getName() == "phitmp18.i.i.split0") + dbgs() << "wobble\n"; + return NewInst; +} + +/*********************************************************************** +* GetConstantSubvector : get a contiguous region from a vector constant +*/ +static Constant *GetConstantSubvector(Constant *V, + unsigned StartIdx, unsigned Size) +{ + Type *ElTy = cast(V->getType())->getElementType(); + Type *RegionTy = VectorType::get(ElTy, Size); + if (isa(V)) + V = UndefValue::get(RegionTy); + else if (isa(V)) + V = ConstantAggregateZero::get(RegionTy); + else { + SmallVector Val; + for (unsigned i = 0; i != Size; ++i) + Val.push_back(V->getAggregateElement(i + StartIdx)); + V = ConstantVector::get(Val); + } + return V; +} + +Value *CMRegion::createRdPredRegionOrConst(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + if (auto C = dyn_cast(Input)) + return GetConstantSubvector(C, Index, Size); + return createRdPredRegion(Input, Index, Size, Name, InsertBefore, DL); +} + +/*********************************************************************** + * CMRegion::createWrPredRegion : create wrpredregion instruction + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert + * Index = start index of subregion + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrpredregion instruction + * + * Unlike createWrRegion, this is a static method in Region, because you pass + * the only region parameter (the start index) directly into this method. 
+ */ +Instruction *CMRegion::createWrPredRegion(Value *OldVal, Value *Input, + unsigned Index, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *Args[] = { // Args to new wrpredregion: + OldVal, // original vector + Input, // value to write into subregion + ConstantInt::get(I32Ty, Index), // start index + }; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, GenXIntrinsic::genx_wrpredregion, + nullptr, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createWrPredPredRegion : create wrpredpredregion instruction + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert + * Index = start index of subregion + * Pred = predicate for the write region + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrpredpredregion instruction + * + * Unlike createWrRegion, this is a static method in Region, because you pass + * the only region parameter (the start index) directly into this method. + */ +Instruction *CMRegion::createWrPredPredRegion(Value *OldVal, Value *Input, + unsigned Index, Value *Pred, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + Type *Tys[] = { OldVal->getType(), Input->getType() }; + Function *CalledFunc = GenXIntrinsic::getGenXDeclaration( + InsertBefore->getParent()->getParent()->getParent(), + GenXIntrinsic::genx_wrpredpredregion, Tys); + Value *Args[] = { OldVal, Input, + ConstantInt::get(Type::getInt32Ty(InsertBefore->getContext()), Index), + Pred }; + auto NewInst = CallInst::Create(CalledFunc, Args, "", InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * setRegionCalledFunc : for an existing rdregion/wrregion call, modify + * its called function to match its operand types + * + * This is used in GenXLegalization after modifying a wrregion operand + * such that its type changes. The called function then needs to change + * because it is decorated with overloaded types. 
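+ *
+ * For example (editor's illustration): if legalization narrows the
+ * "subregion value to write" operand of a wrregion from <8 x i32> to
+ * <4 x i32>, the previously resolved declaration no longer matches, and this
+ * helper re-resolves the call to a genx.wrregioni declaration mangled with
+ * the new operand types.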
+ */ +void CMRegion::setRegionCalledFunc(Instruction *Inst) +{ + auto CI = cast(Inst); + SmallVector Opnds; + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) + Opnds.push_back(CI->getOperand(i)); + Function *Decl = getGenXRegionDeclaration( + Inst->getParent()->getParent()->getParent(), + GenXIntrinsic::getGenXIntrinsicID(Inst), + Inst->getType(), Opnds); + CI->setOperand(CI->getNumArgOperands(), Decl); +} + +/*********************************************************************** + * getRegionDeclaration : get the function declaration for a region intrinsic + * + * Enter: M = Module + * IID = intrinsic ID + * RetTy = return type (can be 0 if return type not overloaded) + * Args = array of operands so we can determine overloaded types + * + * Return: the Function + */ +Function *CMRegion::getGenXRegionDeclaration(Module *M, + GenXIntrinsic::ID IID, Type *RetTy, ArrayRef Args) +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + Type *Tys[] = { RetTy, Args[0]->getType(), Args[4]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: { + Type *Tys[] = { Args[0]->getType(), Args[1]->getType(), + Args[5]->getType(), Args[7]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_rdpredregion: { + Type *Tys[] = { RetTy, Args[0]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_wrpredregion: { + Type *Tys[] = { Args[0]->getType(), Args[1]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + default: + llvm_unreachable("unrecognized region intrinsic ID"); + } + return nullptr; +} + +/*********************************************************************** + * getStartIdx : get the LLVM IR Value for the start index of a region + * + * This is common code used by both createRdRegion and createWrRegion. + */ +Value *CMRegion::getStartIdx(const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + IntegerType *I16Ty = Type::getInt16Ty(InsertBefore->getContext()); + if (!Indirect) + return ConstantInt::get(I16Ty, Offset); + // Deal with indirect (variable index) region. + if (auto VT = dyn_cast(Indirect->getType())) { + if (VT->getNumElements() != NumElements) { + // We have a vector indirect and we need to take a subregion of it. + CMRegion IdxRegion(Indirect); + IdxRegion.getSubregion(IndirectIdx, NumElements / Width); + Indirect = IdxRegion.createRdRegion(Indirect, + Name + ".multiindirect_idx_subregion", InsertBefore, DL); + IndirectIdx = 0; + } + } + Value *Index = Indirect; + if (Offset) { + Constant *OffsetVal = ConstantInt::get(I16Ty, Offset); + if (auto VT = dyn_cast(Indirect->getType())) + OffsetVal = ConstantVector::getSplat(VT->getNumElements(), OffsetVal); + auto BO = BinaryOperator::Create(Instruction::Add, Index, OffsetVal, + Name + ".indirect_idx_add", InsertBefore); + BO->setDebugLoc(DL); + Index = BO; + } + return Index; +} + +/*********************************************************************** + * isSimilar : compare two regions to see if they have the same region + * parameters other than start offset, also allowing element type to + * be different + */ +bool CMRegion::isSimilar(const CMRegion &R2) const +{ + if (ElementBytes == R2.ElementBytes) + return isStrictlySimilar(R2); + // Change the element type to match, so we can compare the regions. 
+  CMRegion R = R2;
+  if (!R.changeElementType(ElementTy))
+    return false;
+  return isStrictlySimilar(R);
+}
+
+BitVector CMRegion::getAccessBitMap(int MinTrackingOffset) const {
+  // Construct the bitmap for a single row.
+  BitVector RowBitMap(getRowLength());
+  for (unsigned i = 0; i < Width; i++) {
+    RowBitMap <<= (Stride * ElementBytes);
+    RowBitMap.set(0, ElementBytes);
+  }
+  // Apply the row bitmap to the whole region bitmap,
+  // exactly NumRows times.
+  BitVector BitMap(getLength());
+  unsigned NumRows = NumElements / Width;
+  if (NumRows != 1) {
+    for (unsigned i = 0; i < NumRows; i++) {
+      BitMap <<= (VStride * ElementBytes);
+      BitMap |= RowBitMap;
+    }
+  } else
+    BitMap = std::move(RowBitMap);
+  // Adjust the mask according to the min tracking
+  // offset for comparison.
+  assert(Offset >= MinTrackingOffset);
+  unsigned Diff = Offset - MinTrackingOffset;
+  if (Diff) {
+    BitMap.resize(BitMap.size() + Diff);
+    BitMap <<= Diff;
+  }
+  return BitMap;
+}
+
+// overlap : compare two regions to see whether they overlap each other.
+bool CMRegion::overlap(const CMRegion &R2) const {
+  // To be conservative, if either of them is indirect, assume they overlap.
+  if (Indirect || R2.Indirect)
+    return true;
+  // To be conservative, if different masks are used, assume they overlap.
+  if (Mask != R2.Mask)
+    return true;
+  // Check offsets of regions for intersection.
+  int MaxOffset = std::max(Offset, R2.Offset);
+  int MinEndOffset = std::min(Offset + getLength(), R2.Offset + R2.getLength());
+  if (MaxOffset > MinEndOffset)
+    return false;
+  // Check overlapping using bit masks.
+  int MinOffset = std::min(Offset, R2.Offset);
+  BitVector Mask1 = getAccessBitMap(MinOffset);
+  BitVector Mask2 = R2.getAccessBitMap(MinOffset);
+  // If there are any common bits then these regions overlap.
+  return Mask1.anyCommon(Mask2);
+}
+
+/***********************************************************************
+ * CMRegion::isContiguous : test whether a region is contiguous
+ */
+bool CMRegion::isContiguous() const {
+  return (Width == 1 || Stride == 1) &&
+         (Width == NumElements || VStride == static_cast<int>(Width));
+}
+
+/***********************************************************************
+ * CMRegion::isWhole : test whether a region covers exactly the whole of the
+ *    given type, allowing for the element type being different
+ */
+bool CMRegion::isWhole(Type *Ty) const
+{
+  return isContiguous() && NumElements * ElementBytes * 8
+      == Ty->getPrimitiveSizeInBits();
+}
+
+/***********************************************************************
+ * evaluateConstantRdRegion : evaluate rdregion with constant input
+ */
+Constant *CMRegion::evaluateConstantRdRegion(Constant *Input, bool AllowScalar)
+{
+  assert(!Indirect);
+  if (NumElements != 1)
+    AllowScalar = false;
+  if (Constant *SV = Input->getSplatValue()) {
+    if (AllowScalar)
+      return SV;
+    return ConstantVector::getSplat(NumElements, SV);
+  }
+  auto VT = cast<VectorType>(Input->getType());
+  SmallVector<Constant *, 8> Values;
+  Constant *Undef = UndefValue::get(AllowScalar
+      ? ElementTy : VectorType::get(ElementTy, NumElements));
+  if (isa<UndefValue>(Input))
+    return Undef;
+  unsigned RowIdx = Offset / ElementBytes;
+  unsigned Idx = RowIdx;
+  unsigned NextRow = Width;
+  for (unsigned i = 0; i != NumElements; ++i) {
+    if (i == NextRow) {
+      RowIdx += VStride;
+      Idx = RowIdx;
+    }
+    if (Idx >= VT->getNumElements())
+      return Undef; // out of range index
+    // Get the element value and push it into Values.
+ if (ConstantDataVector *CDV = dyn_cast(Input)) + Values.push_back(CDV->getElementAsConstant(Idx)); + else { + auto CV = cast(Input); + Values.push_back(CV->getOperand(Idx)); + } + Idx += Stride; + } + if (AllowScalar) + return Values[0]; + return ConstantVector::get(Values); +} + +/*********************************************************************** + * evaluateConstantWrRegion : evaluate wrregion with constant inputs + */ +Constant *CMRegion::evaluateConstantWrRegion(Constant *OldVal, Constant *NewVal) +{ + assert(!Indirect); + SmallVector Vec; + for (unsigned i = 0, e = OldVal->getType()->getVectorNumElements(); + i != e; ++i) + Vec.push_back(OldVal->getAggregateElement(i)); + unsigned Off = Offset / ElementBytes, Row = Off; + auto NewVT = dyn_cast(NewVal->getType()); + unsigned NewNumEls = !NewVT ? 1 : NewVT->getNumElements(); + for (unsigned i = 0;;) { + if (Off >= Vec.size()) + return UndefValue::get(OldVal->getType()); // out of range + Vec[Off] = !NewVT ? NewVal : NewVal->getAggregateElement(i); + if (++i == NewNumEls) + break; + if (i % Width) { + Off += Stride; + continue; + } + Row += VStride; + Off = Row; + } + return ConstantVector::get(Vec); +} + +/*********************************************************************** + * CMRegion::changeElementType : change element type of the region + * + * Return: true if succeeded, false if failed (nothing altered) + */ +bool CMRegion::changeElementType(Type *NewElementType) +{ + assert(Offset % ElementBytes == 0 && "Impossible offset (in bytes) for data type"); + unsigned NewElementBytes = NewElementType->getPrimitiveSizeInBits() / 8U; + if (NewElementType->getPrimitiveSizeInBits()) + NewElementBytes = NewElementBytes ? NewElementBytes : 1; + if (NewElementBytes == ElementBytes) { + // No change in element size + ElementTy = NewElementType; + return true; + } + int Ratio = NewElementBytes/ElementBytes; + if (Ratio >= 1) { + // Trying to make the element size bigger. + if (Width & ((1 * Ratio) - 1)) + return false; // width misaligned + if (Stride != 1) + return false; // rows not contiguous + if (Offset % NewElementBytes != 0) + return false; + NumElements = NumElements / Ratio; + Width = Width / Ratio; + VStride = VStride / Ratio; + if (Width == 1) { + // Width is now 1, so turn it into a 1D region. + Stride = VStride; + VStride = 0; + Width = NumElements; + } + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + // Trying to make the element size smaller. + Ratio = ElementBytes / NewElementBytes;; + if (Stride == 1 || Width == 1) { + // Row contiguous. + Stride = 1; + NumElements *= Ratio; + Width *= Ratio; + VStride *= Ratio; + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + if (!is2D()) { + // 1D and not contiguous. Turn it into a 2D region. + VStride = Stride * Ratio; + Stride = 1; + Width = 1 * Ratio; + NumElements *= Ratio; + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + return false; +} + +/*********************************************************************** + * CMRegion::append : append region AR to this region + * + * Return: true if succeeded (this region modified) + * false if not possible to append (this region in indeterminate state) + * + * This succeeds even if it leaves this region in an illegal state where + * it has a non-integral number of rows. After doing a sequence of appends, + * the caller needs to check that the resulting region is legal by calling + * isWholeNumRows(). 
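+ *
+ * For example (editor's illustration, i32 elements, offsets in bytes):
+ * appending a 1D row <0;4,1> at offset 32 to a 1D region <0;4,1> at offset 0
+ * gives the 8-element 2D region <8;4,1>(0), since the second row starts
+ * 8 elements (32 bytes) after the first.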
+ */ +bool CMRegion::append(CMRegion AR) +{ + assert(AR.isWholeNumRows()); + if (Indirect != AR.Indirect) + return false; + unsigned ARNumRows = AR.NumElements / AR.Width; + // Consider each row of AR separately. + for (unsigned ARRow = 0; ARRow != ARNumRows; + ++ARRow, AR.Offset += AR.VStride * AR.ElementBytes) { + if (NumElements == Width) { + // This region is currently 1D. + if (NumElements == 1) + Stride = (AR.Offset - Offset) / ElementBytes; + else if (AR.Width != 1 && Stride != AR.Stride) + return false; // Mismatched stride. + int NextOffset = Offset + Width * Stride * ElementBytes; + if (AR.Offset == NextOffset) { + // AR is a continuation of the same single row. + Width += AR.Width; + NumElements = Width; + continue; + } + // AR is the start (or whole) of a second row. + if (AR.Width > Width) + return false; // AR row is bigger than this row. + VStride = (AR.Offset - Offset) / ElementBytes; + NumElements += AR.Width; + continue; + } + // This region is already 2D. + unsigned ExtraBit = NumElements % Width; + int NextOffset = Offset + ((VStride * (NumElements / Width)) + + ExtraBit) * ElementBytes; + if (NextOffset != AR.Offset) + return false; // Mismatched next offset. + if (AR.Width > Width - ExtraBit) + return false; // Too much to fill whole row, or remainder of row after + // existing extra bit. + if (AR.Width != 1 && AR.Stride != Stride) + return false; // Mismatched stride. + NumElements += AR.Width; + } + return true; +} + +/*********************************************************************** + * Region debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void CMRegion::dump() const +{ + errs() << *this << "\n"; +} +#endif + +void CMRegion::print(raw_ostream &OS) const +{ + OS << *VectorType::get(ElementTy, NumElements) << " <" + << VStride << ";" << Width << "," << Stride << ">("; + if (Indirect) { + OS << Indirect->getName(); + if (auto VT = dyn_cast(Indirect->getType())) + OS << "<" << VT->getNumElements() << ">(" << IndirectIdx << ")"; + OS << " + "; + } + OS << Offset << ")"; + if (Indirect && ParentWidth) + OS << " {parentwidth=" << ParentWidth << "}"; + if (Mask) + OS << " {mask=" << *Mask << "}"; +} + diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt new file mode 100644 index 000000000000..9e99bece3df0 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt @@ -0,0 +1,20 @@ +set(OPT_SOURCES + CMAnalysis/InstructionSimplifyGenX.cpp + CMAnalysis/ConstantFoldingGenX.cpp + CMTrans/CMABI.cpp + CMTrans/CMImpParam.cpp + CMTrans/CMKernelArgOffset.cpp + CMTrans/CMRegion.cpp + CMPacketize/GenXPacketize.cpp + CMPacketize/PacketBuilder.cpp + CMPacketize/PacketBuilder_math.cpp + CMPacketize/PacketBuilder_mem.cpp + CMPacketize/PacketBuilder_misc.cpp + CMPacketize/WIAnalysis.cpp +) + +add_library(VCTransforms ${OPT_SOURCES}) +target_link_libraries(VCTransforms + VCHeaders + LLVMGenXIntrinsics + ) diff --git a/IGC/VectorCompiler/lib/Support/CMakeLists.txt b/IGC/VectorCompiler/lib/Support/CMakeLists.txt new file mode 100644 index 000000000000..6ec90da356db --- /dev/null +++ b/IGC/VectorCompiler/lib/Support/CMakeLists.txt @@ -0,0 +1,11 @@ +set(SUPPORT_SOURCES + Status.cpp + Options.cpp + ) + +add_library(VCSupport ${SUPPORT_SOURCES}) +target_link_libraries(VCSupport + VCHeaders + LLVMSupport + LLVMOption + ) diff --git a/IGC/VectorCompiler/lib/Support/Options.cpp b/IGC/VectorCompiler/lib/Support/Options.cpp new file mode 100644 index 000000000000..a342d26e9a86 --- /dev/null +++ 
b/IGC/VectorCompiler/lib/Support/Options.cpp @@ -0,0 +1,62 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/Support/Options.h" + +#include + +using namespace vc::options; +using namespace llvm::opt; + +#define PREFIX(NAME, VALUE) static const char *const NAME[] = VALUE; +#include "vc/Support/Options.inc" +#undef PREFIX + +static const OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS, VALUES}, +#include "vc/Support/Options.inc" +#undef OPTION +}; + +namespace { +class VCOptTable : public OptTable { +public: + VCOptTable() : OptTable(InfoTable) { + OptTable &Opt = *this; + (void)Opt; +#define OPTTABLE_ARG_INIT +#include "vc/Support/Options.inc" +#undef OPTTABLE_ARG_INIT + } +}; +} // namespace + +static const VCOptTable OptionsTable; + +const OptTable &vc::getOptTable() { return OptionsTable; } diff --git a/IGC/VectorCompiler/lib/Support/Status.cpp b/IGC/VectorCompiler/lib/Support/Status.cpp new file mode 100644 index 000000000000..94c5d3a15e40 --- /dev/null +++ b/IGC/VectorCompiler/lib/Support/Status.cpp @@ -0,0 +1,150 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/Support/Status.h" + +#include "vc/Support/StatusCode.h" +#include "vc/Support/StatusTraits.h" + +#include "llvm/Support/ErrorHandling.h" + +#include +#include + +namespace { +class vc_error_category : public std::error_category { +public: + const char *name() const noexcept override; + std::string message(int condition) const override; +}; +} // namespace + +const char *vc_error_category::name() const noexcept { + return "vector compiler"; +} + +std::string vc_error_category::message(int condition) const { + using namespace vc; + + switch (static_cast(condition)) { + case errc::dynamic_load_fail: + return ErrorTraits::getMessage(); + case errc::symbol_not_found: + return ErrorTraits::getMessage(); + case errc::bad_spirv: + return ErrorTraits::getMessage(); + case errc::bad_bitcode: + return ErrorTraits::getMessage(); + case errc::invalid_module: + return ErrorTraits::getMessage(); + case errc::target_machine_not_created: + return ErrorTraits::getMessage(); + case errc::not_vc_codegen: + return ErrorTraits::getMessage(); + case errc::invalid_api_option: + return ErrorTraits::getMessage(); + case errc::invalid_internal_option: + return ErrorTraits::getMessage(); + } + llvm_unreachable("Unknown error code"); +} + +static vc_error_category vc_err_category; + +namespace vc { + +const std::error_category &err_category() noexcept { return vc_err_category; } + +// DynLoadError {{ +char DynLoadError::ID = 0; + +void DynLoadError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// SymbolLookupError {{ +char SymbolLookupError::ID = 0; + +void SymbolLookupError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": symbol '" + << Symbol << "' was not found in '" << Library << "'"; +} +// }} + +// BadSpirvError {{ +char BadSpirvError::ID = 0; + +void BadSpirvError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// BadBitcodeError {{ +char BadBitcodeError::ID = 0; + +void BadBitcodeError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// InvalidModuleError {{ +char InvalidModuleError::ID = 0; + +void InvalidModuleError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// TargetMachineError {{ +char TargetMachineError::ID = 0; + +void TargetMachineError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// NotVCError {{ +char NotVCError::ID = 0; + +void NotVCError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// OptionErrorCommon {{ +char OptionError::ID = 0; + +void OptionError::log(llvm::raw_ostream &OS) const { + if (IsInternal) + OS << ErrorTraits::getMessage(); + else + OS << ErrorTraits::getMessage(); + OS << ": " << BadOption; +} +// }} + +} // namespace vc diff --git a/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch b/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch new file mode 100644 index 000000000000..80fab282a3a7 --- /dev/null +++ 
b/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch @@ -0,0 +1,40 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Wed, 24 Jun 2020 16:58:30 +0300 +Subject: [PATCH 1/4] Add common OCL address spaces for VectorCompute globals + This commit allows to use UniformConstant and CrossWorkgroup storage classes + for VectorCompute globals + +--- + lib/SPIRV/VectorComputeUtil.cpp | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/lib/SPIRV/VectorComputeUtil.cpp b/lib/SPIRV/VectorComputeUtil.cpp +index 2c68aa5..26d424f 100755 +--- a/lib/SPIRV/VectorComputeUtil.cpp ++++ b/lib/SPIRV/VectorComputeUtil.cpp +@@ -120,6 +120,10 @@ getVCGlobalVarStorageClass(SPIRAddressSpace AddressSpace) noexcept { + return StorageClassPrivate; + case SPIRAS_Local: + return StorageClassWorkgroup; ++ case SPIRAS_Global: ++ return StorageClassCrossWorkgroup; ++ case SPIRAS_Constant: ++ return StorageClassUniformConstant; + default: + assert(false && "Unexpected address space"); + return StorageClassPrivate; +@@ -133,6 +137,10 @@ getVCGlobalVarAddressSpace(SPIRVStorageClassKind StorageClass) noexcept { + return SPIRAS_Private; + case StorageClassWorkgroup: + return SPIRAS_Local; ++ case StorageClassCrossWorkgroup: ++ return SPIRAS_Global; ++ case StorageClassUniformConstant: ++ return SPIRAS_Constant; + default: + assert(false && "Unexpected storage class"); + return SPIRAS_Private; +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch b/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch new file mode 100644 index 000000000000..9998533a22c4 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch @@ -0,0 +1,198 @@ +From ac92d249ea1177c7bee2b2aa7861267f775ff0db Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 4 Jun 2020 16:34:15 +0300 +Subject: [PATCH 2/4] Add DecorationFuncParamKindINTEL and + DecorationFuncParamDescINTEL + +Change-Id: Ic90237386532736588c558c7479370e48af7ce87 +--- + lib/SPIRV/SPIRVReader.cpp | 12 ++++++++++++ + lib/SPIRV/SPIRVWriter.cpp | 13 +++++++++++++ + lib/SPIRV/VectorComputeUtil.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVDecorate.cpp | 12 ++++++++++++ + lib/SPIRV/libSPIRV/SPIRVDecorate.h | 9 +++++++++ + lib/SPIRV/libSPIRV/SPIRVEnum.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 2 ++ + lib/SPIRV/libSPIRV/spirv.hpp | 2 ++ + 9 files changed, 56 insertions(+) + +diff --git a/lib/SPIRV/SPIRVReader.cpp b/lib/SPIRV/SPIRVReader.cpp +index 210c741..8dc5784 100644 +--- a/lib/SPIRV/SPIRVReader.cpp ++++ b/lib/SPIRV/SPIRVReader.cpp +@@ -3362,6 +3362,18 @@ bool SPIRVToLLVM::transVectorComputeMetadata(SPIRVFunction *BF) { + std::to_string(Kind)); + F->addAttribute(ArgNo + 1, Attr); + } ++ if (BA->hasDecorate(DecorationFuncParamKindINTEL, 0, &Kind)) { ++ Attribute Attr = Attribute::get(*Context, kVCMetadata::VCArgumentKind, ++ std::to_string(Kind)); ++ F->addAttribute(ArgNo + 1, Attr); ++ } ++ if (BA->hasDecorate(DecorationFuncParamDescINTEL)) { ++ auto Desc = ++ BA->getDecorationStringLiteral(DecorationFuncParamDescINTEL).front(); ++ Attribute Attr = ++ Attribute::get(*Context, kVCMetadata::VCArgumentDesc, Desc); ++ F->addAttribute(ArgNo + 1, Attr); ++ } + } + + // Do not add float control if there is no any +diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp 
+index 3f569ff..670ba1a 100644 +--- a/lib/SPIRV/SPIRVWriter.cpp ++++ b/lib/SPIRV/SPIRVWriter.cpp +@@ -567,6 +567,19 @@ void LLVMToSPIRV::transVectorComputeMetadata(Function *F) { + .getAsInteger(0, Kind); + BA->addDecorate(DecorationFuncParamIOKind, Kind); + } ++ if (Attrs.hasAttribute(ArgNo + 1, kVCMetadata::VCArgumentKind)) { ++ SPIRVWord Kind; ++ Attrs.getAttribute(ArgNo + 1, kVCMetadata::VCArgumentKind) ++ .getValueAsString() ++ .getAsInteger(0, Kind); ++ BA->addDecorate(DecorationFuncParamKindINTEL, Kind); ++ } ++ if (Attrs.hasAttribute(ArgNo + 1, kVCMetadata::VCArgumentDesc)) { ++ StringRef Desc = ++ Attrs.getAttribute(ArgNo + 1, kVCMetadata::VCArgumentDesc) ++ .getValueAsString(); ++ BA->addDecorate(new SPIRVDecorateFuncParamDescAttr(BA, Desc.str())); ++ } + } + } + +diff --git a/lib/SPIRV/VectorComputeUtil.h b/lib/SPIRV/VectorComputeUtil.h +index 08d2129..f215b2d 100755 +--- a/lib/SPIRV/VectorComputeUtil.h ++++ b/lib/SPIRV/VectorComputeUtil.h +@@ -116,6 +116,8 @@ const static char VCSLMSize[] = "VCSLMSize"; + const static char VCGlobalVariable[] = "VCGlobalVariable"; + const static char VCVolatile[] = "VCVolatile"; + const static char VCByteOffset[] = "VCByteOffset"; ++const static char VCArgumentKind[] = "VCArgumentKind"; ++const static char VCArgumentDesc[] = "VCArgumentDesc"; + } // namespace kVCMetadata + + /////////////////////////////////////////////////////////////////////////////// +diff --git a/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp b/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp +index 4af7e2c..3e8d4fe 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp ++++ b/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp +@@ -104,6 +104,9 @@ void SPIRVDecorate::encode(spv_ostream &O) const { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::encodeLiterals(Encoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::encodeLiterals(Encoder, Literals); ++ break; + default: + Encoder << Literals; + } +@@ -130,6 +133,9 @@ void SPIRVDecorate::decode(std::istream &I) { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::decodeLiterals(Decoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::decodeLiterals(Decoder, Literals); ++ break; + default: + Decoder >> Literals; + } +@@ -149,6 +155,9 @@ void SPIRVMemberDecorate::encode(spv_ostream &O) const { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::encodeLiterals(Encoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::encodeLiterals(Encoder, Literals); ++ break; + default: + Encoder << Literals; + } +@@ -172,6 +181,9 @@ void SPIRVMemberDecorate::decode(std::istream &I) { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::decodeLiterals(Decoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::decodeLiterals(Decoder, Literals); ++ break; + default: + Decoder >> Literals; + } +diff --git a/lib/SPIRV/libSPIRV/SPIRVDecorate.h b/lib/SPIRV/libSPIRV/SPIRVDecorate.h +index ea816ba..23eca12 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVDecorate.h ++++ b/lib/SPIRV/libSPIRV/SPIRVDecorate.h +@@ -407,6 +407,15 @@ public: + : SPIRVDecorateStrAttrBase(TheTarget, AnnotateString) {} + }; + ++class SPIRVDecorateFuncParamDescAttr ++ : public SPIRVDecorateStrAttrBase { ++public: ++ // Complete constructor for UserSemantic decoration ++ SPIRVDecorateFuncParamDescAttr(SPIRVEntry *TheTarget, ++ const std::string &AnnotateString) ++ : SPIRVDecorateStrAttrBase(TheTarget, 
AnnotateString) {} ++}; ++ + class SPIRVDecorateMergeINTELAttr : public SPIRVDecorate { + public: + // Complete constructor for MergeINTEL decoration +diff --git a/lib/SPIRV/libSPIRV/SPIRVEnum.h b/lib/SPIRV/libSPIRV/SPIRVEnum.h +index 0b65093..c653016 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVEnum.h +@@ -392,6 +392,8 @@ template <> inline void SPIRVMap::init() { + {CapabilityVectorComputeINTEL}); + ADD_VEC_INIT(DecorationFuncParamIOKind, {CapabilityVectorComputeINTEL}); + ADD_VEC_INIT(DecorationStackCallINTEL, {CapabilityVectorComputeINTEL}); ++ ADD_VEC_INIT(DecorationFuncParamKindINTEL, {CapabilityVectorComputeINTEL}); ++ ADD_VEC_INIT(DecorationFuncParamDescINTEL, {CapabilityVectorComputeINTEL}); + } + + template <> inline void SPIRVMap::init() { +diff --git a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +index 09b9b8a..78c7925 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +@@ -422,6 +422,8 @@ inline bool isValid(spv::Decoration V) { + case DecorationReferencedIndirectlyINTEL: + case DecorationVectorComputeFunctionINTEL: + case DecorationStackCallINTEL: ++ case DecorationFuncParamKindINTEL: ++ case DecorationFuncParamDescINTEL: + case DecorationVectorComputeVariableINTEL: + case DecorationGlobalVariableOffsetINTEL: + case DecorationFuncParamIOKind: +diff --git a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +index 867c9c1..077b662 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +@@ -361,6 +361,8 @@ template <> inline void SPIRVMap::init() { + add(DecorationReferencedIndirectlyINTEL, "ReferencedIndirectlyINTEL"); + add(DecorationVectorComputeFunctionINTEL, "VectorComputeFunctionINTEL"); + add(DecorationStackCallINTEL, "StackCallINTEL"); ++ add(DecorationFuncParamKindINTEL, "FuncParamKindINTEL"); ++ add(DecorationFuncParamDescINTEL, "FuncParamDescINTEL"); + add(DecorationVectorComputeVariableINTEL, "VectorComputeVariableINTEL"); + add(DecorationGlobalVariableOffsetINTEL, "GlobalVariableOffsetINTEL"); + add(DecorationFuncParamIOKind, "FuncParamIOKind"); +diff --git a/lib/SPIRV/libSPIRV/spirv.hpp b/lib/SPIRV/libSPIRV/spirv.hpp +index d0f5f75..bfc92ef 100644 +--- a/lib/SPIRV/libSPIRV/spirv.hpp ++++ b/lib/SPIRV/libSPIRV/spirv.hpp +@@ -475,6 +475,8 @@ enum Decoration { + DecorationRestrictPointerEXT = 5355, + DecorationAliasedPointer = 5356, + DecorationAliasedPointerEXT = 5356, ++ DecorationFuncParamKindINTEL = 9624, ++ DecorationFuncParamDescINTEL = 9625, + DecorationReferencedIndirectlyINTEL = 5602, + DecorationSideEffectsINTEL = 5608, + DecorationVectorComputeVariableINTEL = 5624, +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch b/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch new file mode 100644 index 000000000000..a3a466a90230 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch @@ -0,0 +1,216 @@ +From b727c486069844db240b3dead3fe92064b840724 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 4 Jun 2020 15:20:43 +0300 +Subject: [PATCH 3/4] Add SPIRVDLL and VCExport + +Change-Id: I8a541ad383b18fd1b14e75f431e034dc10db6817 +--- + lib/SPIRV/CMakeLists.txt | 41 +++++++++++++++++- + lib/SPIRV/VCExport.cpp | 89 ++++++++++++++++++++++++++++++++++++++++ + lib/SPIRV/VCExport.h | 28 +++++++++++++ + 3 files changed, 157 insertions(+), 1 deletion(-) + mode change 100644 => 100755 
lib/SPIRV/CMakeLists.txt + create mode 100755 lib/SPIRV/VCExport.cpp + create mode 100755 lib/SPIRV/VCExport.h + +diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt +old mode 100644 +new mode 100755 +index 92ba12a..5f5b072 +--- a/lib/SPIRV/CMakeLists.txt ++++ b/lib/SPIRV/CMakeLists.txt +@@ -1,4 +1,5 @@ +-add_llvm_library(LLVMSPIRVLib ++set(SPIRV_SOURCES ++ VCExport.cpp + LLVMToSPIRVDbgTran.cpp + Mangler/FunctionDescriptor.cpp + Mangler/Mangler.cpp +@@ -34,6 +35,10 @@ add_llvm_library(LLVMSPIRVLib + libSPIRV/SPIRVStream.cpp + libSPIRV/SPIRVType.cpp + libSPIRV/SPIRVValue.cpp ++) ++ ++add_llvm_library(LLVMSPIRVLib ++ ${SPIRV_SOURCES} + LINK_COMPONENTS + Analysis + BitWriter +@@ -44,6 +49,31 @@ add_llvm_library(LLVMSPIRVLib + intrinsics_gen + ) + ++# --- mock: add_llvm_library(SPIRVDLL MODULE --- ++# unfortunately this do not work for llvm build system as is so some magic below ++ ++add_library(SPIRVDLL MODULE ++ ${SPIRV_SOURCES} ++) ++ ++llvm_update_compile_flags(SPIRVDLL) ++ ++add_dependencies(SPIRVDLL intrinsics_gen LLVMAnalysis LLVMBitWriter LLVMCore LLVMSupport LLVMTransformUtils) ++target_link_libraries(SPIRVDLL LLVMAnalysis LLVMBitWriter LLVMCore LLVMSupport LLVMTransformUtils) ++ ++install(TARGETS SPIRVDLL ++ EXPORT LLVMExports ++ LIBRARY DESTINATION lib ++ COMPONENT SPIRVDLL) ++ ++add_llvm_install_targets(install-SPIRVDLL ++ DEPENDS SPIRVDLL ++ COMPONENT SPIRVDLL) ++ ++set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS SPIRVDLL) ++ ++# --- end mock --- ++ + target_include_directories(LLVMSPIRVLib + PRIVATE + ${LLVM_INCLUDE_DIRS} +@@ -52,3 +82,12 @@ target_include_directories(LLVMSPIRVLib + ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV + ${CMAKE_CURRENT_SOURCE_DIR}/Mangler + ) ++ ++target_include_directories(SPIRVDLL ++ PRIVATE ++ ${LLVM_INCLUDE_DIRS} ++ ${LLVM_SPIRV_INCLUDE_DIRS} ++ ${CMAKE_CURRENT_SOURCE_DIR} ++ ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV ++ ${CMAKE_CURRENT_SOURCE_DIR}/Mangler ++) +diff --git a/lib/SPIRV/VCExport.cpp b/lib/SPIRV/VCExport.cpp +new file mode 100755 +index 0000000..e8893e1 +--- /dev/null ++++ b/lib/SPIRV/VCExport.cpp +@@ -0,0 +1,89 @@ ++//===- VCExport.cpp - dll interface for SPIRV implementation -*- C++ -*----===// ++// ++// The LLVM/SPIR-V Translator ++// ++//===----------------------------------------------------------------------===// ++// ++// This file implements dll interface of SPIRV translator ++// ++//===----------------------------------------------------------------------===// ++ ++#include ++#include ++#include ++#include ++ ++#include "LLVMSPIRVLib.h" ++#include "SPIRVInternal.h" ++#include "VCExport.h" ++#include "llvm/Bitcode/BitcodeReader.h" ++#include "llvm/Bitcode/BitcodeWriter.h" ++#include "llvm/IR/Module.h" ++#include "llvm/IR/Verifier.h" ++#include "llvm/Support/MemoryBuffer.h" ++ ++SPIRV::TranslatorOpts GetTranslatorOpts() { ++ std::map ExtensionNamesMap; ++#define _STRINGIFY(X) #X ++#define STRINGIFY(X) _STRINGIFY(X) ++#define EXT(X) ExtensionNamesMap[STRINGIFY(X)] = ExtensionID::X; ++#include "LLVMSPIRVExtensions.inc" ++#undef EXT ++#undef STRINGIFY ++#undef _STRINGIFY ++ ++ SPIRV::TranslatorOpts::ExtensionsStatusMap ExtensionsStatus; ++ // Set the initial state: ++ // - during SPIR-V consumption, assume that any known extension is allowed. ++ // - during SPIR-V generation, assume that any known extension is disallowed. ++ // - during conversion to/from SPIR-V text representation, assume that any ++ // known extension is allowed. 
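++  // The loop below then simply marks every known extension as enabled for
++  // this translator build.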
++ for (const auto &It : ExtensionNamesMap) ++ ExtensionsStatus[It.second] = true; ++ SPIRV::TranslatorOpts Opts(VersionNumber::MaximumVersion, ExtensionsStatus); ++ Opts.setFPContractMode(SPIRV::FPContractMode::On); ++ Opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::SPIRVFriendlyIR); ++ return Opts; ++} ++ ++int spirv_read_verify_module( ++ const char *pIn, size_t InSz, ++ void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), ++ void *OutUserData, void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), ++ void *ErrUserData) { ++ LLVMContext Context; ++ StringRef SpirvInput = StringRef(pIn, InSz); ++ std::istringstream IS(SpirvInput.str()); ++ ++ std::unique_ptr M; ++ { ++ llvm::Module *SpirM; ++ std::string ErrMsg; ++ auto Opts = GetTranslatorOpts(); ++ // This returns true on success... ++ bool Status = llvm::readSpirv(Context, Opts, IS, SpirM, ErrMsg); ++ if (!Status) { ++ std::ostringstream OSS; ++ OSS << "spirv_read_verify: readSpirv failed: " << ErrMsg; ++ ErrSaver(OSS.str().c_str(), ErrUserData); ++ return -1; ++ } ++ ++ Status = llvm::verifyModule(*SpirM); ++ if (Status) { ++ ErrSaver("spirv_read_verify: verify Module failed", ErrUserData); ++ return -1; ++ } ++ ++ M.reset(SpirM); ++ } ++ ++ llvm::SmallVector CloneBuffer; ++ llvm::raw_svector_ostream CloneOstream(CloneBuffer); ++ WriteBitcodeToFile(*M, CloneOstream); ++ ++ assert(CloneBuffer.size() > 0); ++ ++ OutSaver(CloneBuffer.data(), CloneBuffer.size(), OutUserData); ++ return 0; ++} +diff --git a/lib/SPIRV/VCExport.h b/lib/SPIRV/VCExport.h +new file mode 100755 +index 0000000..3b989ed +--- /dev/null ++++ b/lib/SPIRV/VCExport.h +@@ -0,0 +1,28 @@ ++//===- VCExport.h - Adding possibility to build spirv as a dll -*- C++ -*-===// ++// ++// The LLVM/SPIR-V Translator ++// ++//===----------------------------------------------------------------------===// ++// ++// This file is kind of a temporal solution ++// We need to live in separate DLL while IGC default SPIRV is not ready ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef SPIRV_VCEXPORT_H ++#define SPIRV_VCEXPORT_H ++ ++#ifdef _WIN32 ++#define __EXPORT__ __declspec(dllexport) ++#else ++#define __EXPORT__ __attribute__((visibility("default"))) ++#endif ++ ++// Returns zero on success. 
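++// Typical call (hypothetical caller, not part of this patch):
++//   spirv_read_verify_module(spv.data(), spv.size(),
++//                            saveBitcode, &bitcodeBuffer,
++//                            saveError, &errorMessage);
++// On success OutSaver receives the serialized LLVM bitcode exactly once and
++// zero is returned; on failure ErrSaver receives a diagnostic message and a
++// negative value is returned.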
++extern "C" __EXPORT__ int spirv_read_verify_module( ++ const char *pIn, size_t InSz, ++ void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), ++ void *OutUserData, void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), ++ void *ErrUserData); ++ ++#endif // SPIRV_VCEXPORT_H +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch b/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch new file mode 100644 index 000000000000..b9deb556bf91 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch @@ -0,0 +1,107 @@ +From 7be7da38da84bd1a5af4e881f8ff3d0a590b8326 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 11 Jun 2020 15:58:34 +0300 +Subject: [PATCH 4/4] Remove LLVMSPIRVLib from targets Rename tool llvm-spirv + to llvm-spirv-vc + +--- + lib/SPIRV/CMakeLists.txt | 21 --------------------- + test/CMakeLists.txt | 12 ------------ + tools/llvm-spirv/CMakeLists.txt | 8 ++------ + 3 files changed, 2 insertions(+), 39 deletions(-) + +diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt +index 5f5b072..7a54f61 100755 +--- a/lib/SPIRV/CMakeLists.txt ++++ b/lib/SPIRV/CMakeLists.txt +@@ -37,18 +37,6 @@ set(SPIRV_SOURCES + libSPIRV/SPIRVValue.cpp + ) + +-add_llvm_library(LLVMSPIRVLib +- ${SPIRV_SOURCES} +- LINK_COMPONENTS +- Analysis +- BitWriter +- Core +- Support +- TransformUtils +- DEPENDS +- intrinsics_gen +-) +- + # --- mock: add_llvm_library(SPIRVDLL MODULE --- + # unfortunately this do not work for llvm build system as is so some magic below + +@@ -74,15 +62,6 @@ set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS SPIRVDLL) + + # --- end mock --- + +-target_include_directories(LLVMSPIRVLib +- PRIVATE +- ${LLVM_INCLUDE_DIRS} +- ${LLVM_SPIRV_INCLUDE_DIRS} +- ${CMAKE_CURRENT_SOURCE_DIR} +- ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV +- ${CMAKE_CURRENT_SOURCE_DIR}/Mangler +-) +- + target_include_directories(SPIRVDLL + PRIVATE + ${LLVM_INCLUDE_DIRS} +diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt +index 3348c03..99c3a63 100644 +--- a/test/CMakeLists.txt ++++ b/test/CMakeLists.txt +@@ -66,14 +66,6 @@ if(NOT LLVM_SPIRV_BUILD_EXTERNAL) + endif(NOT LLVM_SPIRV_BUILD_EXTERNAL) + + +-add_lit_testsuite(check-llvm-spirv "Running the LLVM-SPIRV regression tests" +- ${CMAKE_CURRENT_BINARY_DIR} +- ARGS +- --verbose +- DEPENDS +- ${LLVM_SPIRV_TEST_DEPS} +- llvm-spirv +-) + + # to enable a custom test target on cmake below 3.11 + # starting with 3.11 "test" is only reserved if ENABLE_TESTING(ON) +@@ -82,9 +74,5 @@ if(LLVM_SPIRV_BUILD_EXTERNAL) + if(POLICY CMP0037 AND ${CMAKE_VERSION} VERSION_LESS "3.11.0") + cmake_policy(SET CMP0037 OLD) + endif(POLICY CMP0037 AND ${CMAKE_VERSION} VERSION_LESS "3.11.0") +- add_custom_target(test +- DEPENDS +- check-llvm-spirv +- ) + cmake_policy(POP) + endif(LLVM_SPIRV_BUILD_EXTERNAL) +diff --git a/tools/llvm-spirv/CMakeLists.txt b/tools/llvm-spirv/CMakeLists.txt +index 9aa96d9..3130b92 100644 +--- a/tools/llvm-spirv/CMakeLists.txt ++++ b/tools/llvm-spirv/CMakeLists.txt +@@ -8,17 +8,13 @@ set(LLVM_LINK_COMPONENTS + TransformUtils + ) + +-add_llvm_tool(llvm-spirv ++add_llvm_tool(llvm-spirv-vc + llvm-spirv.cpp + # llvm_setup_rpath messes with the rpath making llvm-spirv not executable from the build directory + NO_INSTALL_RPATH + ) + +-if (LLVM_SPIRV_BUILD_EXTERNAL) +- target_link_libraries(llvm-spirv PRIVATE LLVMSPIRVLib) +-endif() +- 
+-target_include_directories(llvm-spirv ++target_include_directories(llvm-spirv-vc + PRIVATE + ${LLVM_INCLUDE_DIRS} + ${LLVM_SPIRV_INCLUDE_DIRS} +-- +2.17.1 + diff --git a/IGC/VectorCompiler/tests/vctest_config.yml b/IGC/VectorCompiler/tests/vctest_config.yml new file mode 100644 index 000000000000..3e9f97e61bee --- /dev/null +++ b/IGC/VectorCompiler/tests/vctest_config.yml @@ -0,0 +1,2 @@ +--- +version: 64 diff --git a/IGC/VectorCompiler/unittests/CMakeLists.txt b/IGC/VectorCompiler/unittests/CMakeLists.txt new file mode 100644 index 000000000000..d57a09609259 --- /dev/null +++ b/IGC/VectorCompiler/unittests/CMakeLists.txt @@ -0,0 +1,9 @@ +add_custom_target(GenXUnitTests) +set_target_properties(GenXUnitTests PROPERTIES FOLDER "GenXTests") + +function(add_genx_unittest test_dirname) + add_unittest(GenXUnitTests ${test_dirname} ${ARGN}) +endfunction() + +add_subdirectory(SPIRVConversions) +add_subdirectory(Regions) diff --git a/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt b/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt new file mode 100644 index 000000000000..11e7cd4e101b --- /dev/null +++ b/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + CodeGen + GenXCodeGen + GenXOpts + ) + +add_genx_unittest(RegionsTests + OverlapTest.cpp + ) + + +target_include_directories(RegionsTests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../lib/GenXCodeGen") +target_link_libraries(RegionsTests PRIVATE LLVMTestingSupport) + + diff --git a/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp b/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp new file mode 100644 index 000000000000..8f95de6470a1 --- /dev/null +++ b/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp @@ -0,0 +1,81 @@ +//===- llvm/unittest/GenXIntrinsics/GenXIntrinsicsTest.cpp - --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
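+//
+// The test below builds pairs of genx::Region descriptors (Offset, Width,
+// NumElements, Stride, VStride) and checks Region::overlap() on overlapping
+// and on disjoint regions.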
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" + +#include "GenXRegion.h" + +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { +TEST(GenXCodeGen, RegionOverlapping) { + LLVMContext Context; + + genx::Region R1(VectorType::get(Type::getDoubleTy(Context), 16)); + R1.VStride = 0; + R1.NumElements = R1.Width = 16; + R1.Stride = 1; + R1.Offset = 128; + genx::Region R2(VectorType::get(Type::getDoubleTy(Context), 8)); + R2.VStride = 0; + R2.NumElements = R2.Width = 8; + R2.Stride = 1; + R2.Offset = 192; + EXPECT_EQ(R1.overlap(R2), true); + R2.Offset = 256; + EXPECT_EQ(R2.overlap(R1), false); + + genx::Region R3(VectorType::get(Type::getInt32Ty(Context), 4)); + R3.VStride = 2; + R3.NumElements = 8; + R3.Width = 1; + R3.Stride = 0; + R3.Offset = 0; + genx::Region R4(R3); + EXPECT_EQ(R3.overlap(R4), true); + R4.Offset = R4.ElementBytes; + EXPECT_EQ(R3.overlap(R4), false); + R4.Offset = R4.ElementBytes * 2; + EXPECT_EQ(R3.overlap(R4), true); + R4.Offset = 6; + EXPECT_EQ(R3.overlap(R4), true); + + genx::Region R5(VectorType::get(Type::getInt16Ty(Context), 4)); + R5.VStride = 8; + R5.NumElements = 4; + R5.Width = 2; + R5.Stride = 1; + R5.Offset = 0; + genx::Region R6(R5); + R6.Offset = R6.ElementBytes; + EXPECT_EQ(R5.overlap(R6), true); + R6.Offset = R6.ElementBytes * 2; + EXPECT_EQ(R5.overlap(R6), false); + + genx::Region R7(VectorType::get(Type::getDoubleTy(Context), 128)); + R7.VStride = 32; + R7.NumElements = 128; + R7.Width = 8; + R7.Stride = 2; + R7.Offset = 0; + genx::Region R8(VectorType::get(Type::getInt32Ty(Context), 256)); + R8.VStride = 1; + R8.Width = R8.NumElements = 128; + R8.Stride = 4; + R8.Offset = R7.ElementBytes; + EXPECT_EQ(R7.overlap(R8), false); + R8.Offset--; + EXPECT_EQ(R7.overlap(R8), true); +} + +} // namespace diff --git a/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt b/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt new file mode 100644 index 000000000000..f9a678d76937 --- /dev/null +++ b/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt @@ -0,0 +1,16 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + CodeGen + GenXIntrinsics + SPIRVLib + ) + +add_genx_unittest(SPIRVConversionsTests + SPIRVConversionsTest.cpp + ) + +target_include_directories(SPIRVConversionsTests PRIVATE ${SPIRV_INCLUDE_DIR}) +target_link_libraries(SPIRVConversionsTests PRIVATE LLVMTestingSupport) + + diff --git a/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp b/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp new file mode 100644 index 000000000000..b08ee54c4db8 --- /dev/null +++ b/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp @@ -0,0 +1,255 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "llvm/ADT/StringRef.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Error.h" + +#include "LLVMSPIRVLib.h" +#include "llvm/Target/TargetMachine.h" + +#include "gtest/gtest.h" + + +#include +#include + +using namespace llvm; + +namespace { + +static GenXIntrinsic::ID BeginGenXID = llvm::GenXIntrinsic::genx_3d_load; +static GenXIntrinsic::ID EndGenXID = llvm::GenXIntrinsic::genx_zzzzend; + +// Currently returns some fixed types. +Type *generateAnyType(Intrinsic::IITDescriptor::ArgKind AK, LLVMContext &Ctx) { + using namespace Intrinsic; + + switch (AK) { + case IITDescriptor::AK_Any: + case IITDescriptor::AK_AnyInteger: + return Type::getInt32Ty(Ctx); + case IITDescriptor::AK_AnyFloat: + return Type::getDoubleTy(Ctx); + case IITDescriptor::AK_AnyPointer: + return Type::getInt32PtrTy(Ctx); + case IITDescriptor::AK_AnyVector: + return VectorType::get(Type::getInt32Ty(Ctx), 8); + } + llvm_unreachable("All types should be handled"); +} + +void generateOverloadedTypes(GenXIntrinsic::ID Id, LLVMContext &Ctx, + SmallVectorImpl &Tys) { + using namespace Intrinsic; + + SmallVector Table; + GenXIntrinsic::getIntrinsicInfoTableEntries(Id, Table); + + for (unsigned i = 0, e = Table.size(); i != e; ++i) { + auto Desc = Table[i]; + if (Desc.Kind != IITDescriptor::Argument) + continue; + + size_t ArgNum = Desc.getArgumentNumber(); + Tys.resize(std::max(ArgNum + 1, Tys.size())); + + Tys[ArgNum] = generateAnyType(Desc.getArgumentKind(), Ctx); + } +} + +static std::string ty2s(Type* ty) { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + ty->print(rso, true); + return rso.str(); +} +static std::string k2s(std::map& s, + Attribute::AttrKind kkk) { + for (const auto& i: s) { + if (i.second == kkk) + return i.first; + } + return "n/a"; +} +class SpirvConvertionsTest : public testing::Test { +protected: + void SetUp() override { + M_.reset(new Module("Test_Module", Ctx_)); + M_->setTargetTriple("spir64-unknown-unknown"); + } + + void TearDown() override { + M_.reset(); + } + + Module* Retranslate(LLVMContext& ctx, std::string& err) { + err.clear(); + std::stringstream ss; + writeSpirv(M_.get(), ss, err); + + if (!err.empty()) + return nullptr; + + std::string s_sv_ir = ss.str(); + std::istrstream ir_stream(s_sv_ir.data(), s_sv_ir.size()); + + Module* result = nullptr; + readSpirv(ctx, ir_stream, result, err); + + if (!err.empty()) + return nullptr; + + return result; + } + + LLVMContext Ctx_; + std::unique_ptr M_; + std::set FN_; +}; + +TEST_F(SpirvConvertionsTest, IntrinsicAttrs) { + Type *FArgTy[] = {Type::getInt32PtrTy(Ctx_)}; + FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx_), FArgTy, false); + Function *F = Function::Create(FT, Function::ExternalLinkage, "", M_.get()); + BasicBlock *BB = BasicBlock::Create(Ctx_, "", F); + + IRBuilder<> Builder(BB); + + for (unsigned id = 
BeginGenXID; id < EndGenXID; ++id) {
+    GenXIntrinsic::ID XID = static_cast<GenXIntrinsic::ID>(id);
+
+    SmallVector Tyss;
+    generateOverloadedTypes(XID, Ctx_, Tyss);
+
+    Function* f = GenXIntrinsic::getGenXDeclaration(M_.get(), XID, Tyss);
+    SmallVector Args;
+    for (Type* ty: f->getFunctionType()->params()) {
+      Value* arg = llvm::Constant::getNullValue(ty);
+      Args.push_back(arg);
+
+      FN_.insert(f->getName().str());
+      /*
+      std::cout << "name: " << f->getName().str() << "\n";
+      Type* aty = arg->getType();
+      std::cout << " param_type: " << ty2s(ty) << ' ' << (void*)ty << "\n";
+      std::cout << " arg_type: " << ty2s(aty) << ' ' << (void*)aty << "\n";
+      */
+    }
+    Builder.CreateCall(f, Args);
+  }
+  llvm::Error merr = M_->materializeAll();
+  if (merr)
+    FAIL() << "materialization of the module resulted in failure: " << merr << "\n";
+
+  std::string err;
+  LLVMContext C;
+  Module* M = Retranslate(C, err);
+  if (!M) {
+    FAIL() << "failure during retranslation: " << err << "\n";
+    return;
+  }
+
+  // M_->dump();
+  // M->dump();
+
+  for (const std::string& fname : FN_) {
+    // std::cout << "processing <" << fname << ">" << "\n";
+    Function* fl = M->getFunction(fname);
+    Function* fr = M_->getFunction(fname);
+
+    if (!fl)
+      FAIL() << "could not find <" << fname << "> in the converted Module\n";
+    if (!fr)
+      FAIL() << "could not find <" << fname << "> in the original Module\n";
+
+    // fl->getAttributes().dump();
+    // fr->getAttributes().dump();
+
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      Attribute::AttrKind att = (Attribute::AttrKind)i;
+      EXPECT_TRUE(fl->hasFnAttribute(att) == fr->hasFnAttribute(att));
+    }
+  }
+}
+
+TEST_F(SpirvConvertionsTest, FunctionAttrs) {
+
+  // TODO: think about how one can test all attributes. Right now the problem
+  // is that I don't know how to differentiate between attributes which require
+  // a value from those that don't.
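+  // Consequently, only attribute kinds that take no associated value are
+  // listed below.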
+  std::map<std::string, Attribute::AttrKind> kinds = {
+    { "Convergent", Attribute::Convergent },
+    { "NoReturn", Attribute::NoReturn },
+    { "NoInline", Attribute::NoInline },
+    { "NoUnwind", Attribute::NoUnwind },
+    { "ReadNone", Attribute::ReadNone },
+    { "SafeStack", Attribute::SafeStack },
+    { "WriteOnly", Attribute::WriteOnly },
+  };
+  for (const auto& k : kinds) {
+    Type *FArgTy[] = {Type::getInt32PtrTy(Ctx_)};
+    FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx_), FArgTy, false);
+    Function* test_f =
+        Function::Create(FT, Function::ExternalLinkage, k.first, M_.get());
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      if (test_f->hasFnAttribute((Attribute::AttrKind)i)) {
+        test_f->removeFnAttr((Attribute::AttrKind)i);
+      }
+    }
+    test_f->addFnAttr(k.second);
+    BasicBlock *aux_BB = BasicBlock::Create(Ctx_, "", test_f);
+    IRBuilder<> aux_Builder(aux_BB);
+  }
+
+  std::string err;
+  LLVMContext C;
+  Module* M = Retranslate(C, err);
+  if (!M) {
+    FAIL() << "failure during retranslation: " << err << "\n";
+    return;
+  }
+  for (const auto& k : kinds) {
+    Function* fl = M->getFunction(k.first);
+    Function* fr = M_->getFunction(k.first);
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      Attribute::AttrKind att = (Attribute::AttrKind)i;
+      if ((fl->hasFnAttribute(att) != fr->hasFnAttribute(att))) {
+        FAIL() << "Attribute mismatch for <" << k.first << "> at attr:" <<
+          i << " (" << k2s(kinds, att) << ")\n";
+      }
+    }
+  }
+  // M_->dump();
+  // M->dump();
+}
+
+
+} // namespace
diff --git a/IGC/common/igc_flags.def b/IGC/common/igc_flags.def
index e517fbbde644..106b5bb6dd6b 100644
--- a/IGC/common/igc_flags.def
+++ b/IGC/common/igc_flags.def
@@ -407,3 +407,5 @@ DECLARE_IGC_REGKEY(bool, ApplyConservativeRastWAHeader, true, "Apply WaConservat
 DECLARE_IGC_GROUP("OGL Frontend")
     DECLARE_IGC_REGKEY(bool, OGLMinimumDump, false, "Minimum dump for testing - first and last .ll, .cos and compiler output", true)
+DECLARE_IGC_GROUP("VectorCompiler Options")
+    DECLARE_IGC_REGKEY(bool, VCOptimizeNone, false, "Same as -optimize=none in vector compiler options", true)