update the scan and the copy examples

Change-Id: Idd886e2bfbd0b2df7997d6be3b2eba46c59f1941
intel · Jan 8, 2020 · 2c05d2b · 2c05d2b
1 parent 06b0d73
commit 2c05d2b
Show file tree

Hide file tree

Showing 32 changed files with 2,107 additions and 681 deletions.
diff --git a/test/external_contribution/CopyPipelineHigh6/CPipeline.cpp b/test/external_contribution/CopyPipelineHigh6/CPipeline.cpp
diff --git a/...tion/ScanToJpeg/Rgb2Encode/Rgb2Encode.cpp → ...neHigh6/ErrorDiffusion/ErrorDiffusion.cpp b/...tion/ScanToJpeg/Rgb2Encode/Rgb2Encode.cpp → ...neHigh6/ErrorDiffusion/ErrorDiffusion.cpp
@@ -20,43 +20,38 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 #include "cm_rt.h"
-#include "Rgb2Encode.h"
+#include "ErrorDiffusion.h"
 
 #define BLOCK_WIDTH     8
-#define BLOCK_HEIGHT    8
+#define BLOCK_HEIGHT    4
 
-Rgb2Encode::Rgb2Encode(CmDevice *pdevice)
+ErrorDiffusion::ErrorDiffusion(CmDevice *pdevice)
 {
    m_pCmDev = pdevice;
    m_pKernel   = NULL;
-
-   float coefficients[9] = {0.299, 0.587, 0.114, -0.168736, -0.331264, 0.5, 0.5, -0.418688, -0.081312};
-   std::copy(coefficients, coefficients + 9, m_coeffs);
-
-   float offsets[3] = {0.0, 128.0, 128.0};
-   std::copy(offsets, offsets + 3, m_offsets);
 }
 
-char * Rgb2Encode::GetKernelName(const int yuvFormat)
+ErrorDiffusion::~ErrorDiffusion(void)
 {
-   if (yuvFormat == 0) // YUV444
-      return "Rgb2YCbCr_GENX";
-   else if (yuvFormat == 1) // YUV400 or grayscale
-      return "Rgb2Y8_GENX";
+   free(m_pErrRowBuf);
+   free(m_pErrColBuf);
 }
 
-int Rgb2Encode::PreRun(
+int ErrorDiffusion::PreRun(
       CmKernel      *pKernel,
-      SurfaceIndex  *pSI_SrcSurfR,
-      SurfaceIndex  *pSI_SrcSurfG,
-      SurfaceIndex  *pSI_SrcSurfB,
-      SurfaceIndex  *pSI_DstSurf,
+      SurfaceIndex  *pSI_SrcSurfCMYK,
+      SurfaceIndex  *pSI_DstSurfCMYK,
       int            nPicWidth,
       int            nPicHeight
       )
 {
    int   result;
+
    CmThreadSpace *pTS       = NULL;
+   CmBuffer *pErrRowBuf = NULL;
+   CmBuffer *pErrColBuf = NULL;
+   SurfaceIndex *pSI_ErrRowBuf = NULL;
+   SurfaceIndex *pSI_ErrColBuf = NULL;
 
    int nPicWidthInBlk, nPicHeightInBlk;
    int nKernelInput;
@@ -66,16 +61,33 @@ int Rgb2Encode::PreRun(
    nPicWidthInBlk = (nPicWidth + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
    nPicHeightInBlk = (nPicHeight + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT;
 
-   CM_Error_Handle(m_pCmDev->CreateThreadSpace(nPicWidthInBlk, nPicHeightInBlk, pTS), "CreateThreadSpace Error");
+   CM_Error_Handle(m_pCmDev->CreateThreadSpace((nPicWidthInBlk+1), nPicHeightInBlk, pTS), "CreateThreadGroupSpace Error");
+
+   CM_Error_Handle(pTS->SelectThreadDependencyPattern(CM_WAVEFRONT26), "Select thread dependency error");
+
+   int rowErrBufferSize = (nPicWidthInBlk + 1) * 64;
+   uchar *pRowErr = (uchar *) CM_ALIGNED_MALLOC(rowErrBufferSize, 0x1000);
+   memset(pRowErr, 0, rowErrBufferSize);
+
+   int colErrBufferSize = nPicHeightInBlk * 128;
+   uchar *pColErr = (uchar *) CM_ALIGNED_MALLOC(colErrBufferSize, 0x1000);
+   memset(pColErr, 0, colErrBufferSize);
+
+   CM_Error_Handle(m_pCmDev->CreateBuffer(rowErrBufferSize, pErrRowBuf),
+         "CreateBuffer Error");
+   pErrRowBuf->GetIndex(pSI_ErrRowBuf);
+
+   CM_Error_Handle(m_pCmDev->CreateBuffer(colErrBufferSize, pErrColBuf),
+         "CreateBuffer Error");
+   pErrColBuf->GetIndex(pSI_ErrColBuf);
 
    // Set up kernel args for Pipeline filter
    nKernelInput = 0;
-   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfR);
-   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfG);
-   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfB);
-   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_DstSurf);
-   result = m_pKernel->SetKernelArg(nKernelInput++, 9*sizeof(float), &m_coeffs[0]);
-   result = m_pKernel->SetKernelArg(nKernelInput++, 3*sizeof(float), &m_offsets[0]);
+   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfCMYK);
+   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_ErrRowBuf);
+   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_ErrColBuf);
+   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_DstSurfCMYK);
+   result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(int), &nPicWidthInBlk );
    result = m_pKernel->AssociateThreadSpace(pTS);
 
    CM_Error_Handle(result, "SetKernelArg Error");

diff --git a/test/external_contribution/CopyPipelineHigh6/ErrorDiffusion/ErrorDiffusion_genx.cpp b/test/external_contribution/CopyPipelineHigh6/ErrorDiffusion/ErrorDiffusion_genx.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <cm/cm.h>
+
+#define BLKW 8
+#define BLKH 4
+#define ITERATION 4
+
+#define X 0
+#define Y 1
+
+/*
+ * Error diffusion pushes each pixel's error to its neighbouring pixels, so the
+ * algorithm normally has to run in sequence.  Using the following pixel
+ * processing order allows parallelism without violating the dependencies.
+ * pixelRowIndex and pixelColIndex are used to determine the sequence in which
+ * pixels are processed by the error diffusion.
+ *
+ * Some pixels can be processed together, but some cannot.  To reduce
+ * conditional checking, a pixel that cannot be processed in parallel with other
+ * pixels is processed repeatedly.  For SIMD, this has no performance penalty.
+ */
+
+short pixelLocX[32] ={ 0,1,2,3,4,5,6,7,-2,-1,0,1,2,3,4,5,          // x offsets of rows 0 and 1; row 1 starts 2 pixels further left
+                      -4,-3,-2,-1,0,1,2,3,-6,-5,-4,-3,-2,-1,0,1    // x offsets of rows 2 and 3; they start at -4 and -6
+                     };
+short pixelLocY[32] ={ 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,            // y offsets: 8 entries per row, rows 0 and 1
+                      2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3              // rows 2 and 3
+                     };
+uchar weight[4] = {1,5,3,7};   // weight numerators; the kernel divides them by 16 before use
+
+extern "C" _GENX_MAIN_ void
+ErrorDiffusion_GENX(           // thresholds one skewed 8x4 pixel block per thread using diffused error
+   SurfaceIndex SrcSI,         // surface the input pixels are read from
+   SurfaceIndex ErrBufRowSI,   // buffer of errors indexed by the thread's X origin
+   SurfaceIndex ErrBufColSI,   // buffer of errors indexed by the thread's Y origin
+   SurfaceIndex DstSI,         // surface the thresholded pixels are written to
+   int widthBlk                // X origin value that receives the right-border handling
+)
+{
+   matrix<uint, BLKH, 8> input;               // 4 rows x 8 pixels, one uint per pixel
+   matrix<uint, BLKH, 8> output;              // thresholded pixels, same layout as input
+   matrix<short, 5, 64> errBuf = 0;           // working errors: row 0 is loaded from the row buffer, rows 1..4 are filled below
+   vector<short, 64> errRowTemp;              // staging area for values read from the two error buffers
+   vector<short, 2> pos, pos4;                // thread origin, and that origin scaled to pixel units
+   vector<uint, 32> inputVectorX(pixelLocX);  // per-pixel x coordinates (made absolute below)
+   vector<uint, 32> inputVectorY(pixelLocY);  // per-pixel y coordinates (made absolute below)
+   matrix<uint, 4, 16> tmpInput;              // only row 0 is used, as a 16-pixel transfer buffer
+
+   cm_wait();   // NOTE(review): presumably blocks until the threads this one depends on have signalled - confirm against the CM spec
+
+   pos(X) = get_thread_origin_x();
+   pos(Y) = get_thread_origin_y();
+
+   pos4(X) = pos(X) * BLKW;   // block origin in pixels
+   pos4(Y) = pos(Y) * BLKH;
+
+   inputVectorX += pos4(X);   // turn the table offsets into absolute pixel coordinates
+   inputVectorY += pos4(Y);
+
+   read_typed(SrcSI, CM_R_ENABLE, tmpInput.select<1,1,16,1>(0,0),   // first 16 coordinates: block rows 0 and 1
+         inputVectorX.select<16,1>(0), inputVectorY.select<16,1>(0));
+   input.select<1,1,8,1>(0,0) = tmpInput.select<1,1,8,1>(0,0);
+   input.select<1,1,8,1>(1,0) = tmpInput.select<1,1,8,1>(0,8);
+
+   read_typed(SrcSI, CM_R_ENABLE, tmpInput.select<1,1,16,1>(0,0),   // last 16 coordinates: block rows 2 and 3
+         inputVectorX.select<16,1>(16), inputVectorY.select<16,1>(16));
+   input.select<1,1,8,1>(2,0) = tmpInput.select<1,1,8,1>(0,0);
+   input.select<1,1,8,1>(3,0) = tmpInput.select<1,1,8,1>(0,8);
+
+   // TODO: try to use a single column for the column error buffer and a
+   // single row for the row buffer (a single row for the row buffer may not be possible)
+   int colErrBufOffset = (pos(Y))*128;       // 128 per Y step, matching the 64-short write at the end of the kernel
+   int rowErrBufOffset = (pos(X)+1)*64-24;   // 64 per X step, backed up by 24 from the next slot
+
+
+   if (pos(Y) != 0)   // for Y == 0 the row errors are zeroed instead of read
+      read(DWALIGNED(ErrBufRowSI), rowErrBufOffset, errRowTemp.select<40,1>(0));
+   else
+      errRowTemp.select<40,1>(0) = 0;
+
+   errBuf.select<1,1,40,1>(0,24) = errRowTemp.select<40,1>(0);   // fills columns 24..63 of errBuf row 0
+
+   if (pos(X) != 0)   // for X == 0 the column errors are zeroed instead of read
+      read(ErrBufColSI, colErrBufOffset , errRowTemp.select<40,1>(0));
+   else
+      errRowTemp.select<40,1>(0) = 0;
+
+   errBuf.select<1,1,12,1>(1,16) = errRowTemp.select<12,1>(0);   // spread the 40 column values over errBuf rows 1..4
+   errBuf.select<1,1,12,1>(2,8) = errRowTemp.select<12,1>(12);
+   errBuf.select<1,1,12,1>(3,0) = errRowTemp.select<12,1>(24);
+   errBuf.select<1,1,4,1>(4,0) = errRowTemp.select<4,1>(36);
+
+   // The read fetches one extra column to the left and right, so it maps
+   // directly into the error buffer.
+   //
+   // The input block is a diagonal box, but we read a square that wraps this box.
+
+   // Each row of the block starts two pixels to the left of the row above it
+   // (see the pixelLocX table), which gives the block the skewed outline
+   // sketched below:
+   //       ________
+   //     _|       _|
+   //   _|       _|
+   // _|       _|
+   //|_______ |
+   //
+   vector<uint, 8> oldpixel, newpixel;   // one row of 8 pixels before / after thresholding
+
+   vector_ref<uchar, 32> oldpixelc = oldpixel.format<uchar>();   // byte views, 4 bytes per pixel (presumably C, M, Y, K - confirm)
+   vector_ref<uchar, 32> newpixelc = newpixel.format<uchar>();
+   vector<float, 32> fxs;   // incoming error per byte of the current row, later reused for the outgoing error
+
+   vector<float, 4> weightf(weight);
+
+   weightf =weightf/16.0f;   // {1,5,3,7}/16
+
+   // Reminder: the error buffer is declared above as matrix<short, 5, 64> errBuf = 0;
+   int colIndex, rowIndex;
+#pragma unroll
+   for (int i=0; i < ITERATION; i++)   // one pass per block row
+   {
+      fxs = 0;
+      // oldpixel is one row of the input block
+      oldpixel = input.select<1,1,8,1>(i,0);
+
+      colIndex = 24 - i*8;   // each row starts 8 shorts (2 pixels x 4 bytes) further left in errBuf
+      rowIndex = i;          // errBuf row holding the errors of the row above
+      if (pos(X) == widthBlk)   // clears columns 28..63 of all 5 errBuf rows
+           errBuf.select<5,1,36,1>(0, 28) = 0;
+
+      fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex) * weightf[0];     // 1/16 of the row-above errors at colIndex
+
+      fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex+4) * weightf[1];   // 5/16, one pixel (4 shorts) to the right
+
+      fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex+8) * weightf[2];   // 3/16, two pixels to the right
+
+      if (pos(X) == 0)   // drop the incoming error of the leading 2*i pixels of row i (negative offsets in pixelLocX)
+      {
+         if (i == 1)
+            fxs.select<8,1>(0)  = 0;
+         else if (i == 2)
+            fxs.select<16,1>(0) =  0;
+         else if (i == 3)
+            fxs.select<24,1>(0) = 0;
+      }
+
+      rowIndex += 1;   // from here on: the errBuf row that receives this row's errors
+
+      vector<float, 4> pixelleft (errBuf.select<1,1,4,1>(rowIndex, colIndex));   // error just left of where this row's errors are stored
+      vector<float, 4> singlefxs;
+      int rowi;
+
+#pragma unroll
+      for (int ii=0; ii < 8; ii++)   // pixels of a row are handled left to right, 4 bytes at a time
+      {
+         rowi = ii*4;
+
+         singlefxs = pixelleft * weightf[3];   // 7/16 of the previous pixel's error
+
+         singlefxs = singlefxs + fxs.select<4,1>(rowi) +
+            oldpixelc.select<4,1>(rowi);
+
+         newpixelc.select<4,1>(rowi) = (singlefxs > 127.0f)* 0xff;   // threshold: 0xff when above 127, otherwise 0
+         fxs.select<4,1>(rowi) = cm_rndd<float>(singlefxs - newpixelc.select<4,1>(rowi));   // remaining error; cm_rndd presumably rounds down - confirm
+         pixelleft = fxs.select<4,1>(rowi);   // becomes the left error for the next pixel
+
+      }
+
+      // Temporary workaround for incorrect data at the right border
+      if (pos(X) == widthBlk)   // keep the error of only the first 2*i pixels of row i
+      {
+         if (i == 1)
+           fxs.select<24,1>(8) = 0;
+         else if (i == 2)
+            fxs.select<16,1>(16) = 0;
+         else if (i == 3)
+            fxs.select<8,1>(24) = 0;
+      }
+      errBuf.select<1,1,32,1>(rowIndex,colIndex+4) = fxs;   // store this row's errors for the next pass
+
+      output.select<1,1,8,1>(rowIndex-1, 0) = newpixel;   // rowIndex-1 == i
+
+   }
+
+   // The output needs a scattered write, using the same per-pixel coordinates as the read
+   matrix<uint, 4, 16> tmpOut;   // only row 0 is assigned before each write
+
+   tmpOut.select<1,1,8,1>(0,0) = output.select<1,1,8,1>(0,0);
+   tmpOut.select<1,1,8,1>(0,8) = output.select<1,1,8,1>(1,0);
+
+   write_typed(DstSI, CM_R_ENABLE, tmpOut,   // block rows 0 and 1
+         inputVectorX.select<16,1>(0), inputVectorY.select<16,1>(0));
+
+   tmpOut.select<1,1,8,1>(0,0) = output.select<1,1,8,1>(2,0);
+   tmpOut.select<1,1,8,1>(0,8) = output.select<1,1,8,1>(3,0);
+   write_typed(DstSI, CM_R_ENABLE, tmpOut,   // block rows 2 and 3
+         inputVectorX.select<16,1>(16), inputVectorY.select<16,1>(16));
+
+   vector<short, 64> errBufColOut;   // only elements 0..39 are assigned below
+
+   errBufColOut.select<12,1>(0) = errBuf.select<1,1,12,1>(1,48);    // same 12/12/12/4 layout as the column-buffer read, taken 32 columns further right
+   errBufColOut.select<12,1>(12) = errBuf.select<1,1,12,1>(2,40);
+   errBufColOut.select<12,1>(24) = errBuf.select<1,1,12,1>(3,32);
+   errBufColOut.select<4,1>(36) = errBuf.select<1,1,4,1>(4,32);
+
+   colErrBufOffset = (pos(Y))*128;
+   rowErrBufOffset = (pos(X))*64;   // unlike the read offset: no +1 and no -24
+
+   write(ErrBufRowSI, rowErrBufOffset, errBuf.select<1,1,32,1>(4,4).format<short>());   // 32 shorts from errBuf row 4, starting at column 4
+   write(ErrBufColSI, colErrBufOffset, errBufColOut.select<64,1>(0));                   // all 64 shorts are written
+
+   cm_fence();    // NOTE(review): presumably orders the writes above before the signal - confirm against the CM spec
+   cm_signal();   // NOTE(review): presumably releases the threads waiting in cm_wait() - confirm
+}
diff --git a/test/external_contribution/CopyPipelineHigh6/Makefile.linux b/test/external_contribution/CopyPipelineHigh6/Makefile.linux
@@ -3,7 +3,7 @@
 ###########################################################################
 
 #macro CM_ROOT and APP_NAME are passed as Make parameters
-CM_ROOT := /home/gangche1/Linux_C_for_Media_Development_Package_20181022
+#CM_ROOT := /home/gangche1/Linux_C_for_Media_Development_Package_20181022
 #CMC := ../../../build.64.linux/bin/cmc
 #CM_ROOT := ../..
 CMC := $(CM_ROOT)/compiler/bin/cmc
@@ -20,7 +20,7 @@ HOST_FILE = $(filter-out $(KERNEL_FILE), $(ALL_FILE))
 APP         := $(APP_NAME)
 
 CXX           := g++
-INCL          := -I$(CM_ROOT)/runtime/include -I.. -Iinclude
+INCL          := -I$(CM_ROOT)/runtime/include -I.. -Iinclude -I$(CM_ROOT)/examples
 CXXFLAGS      := -w -g ${INCL} -msse4.1 -D__LINUX__ -DLINUX -O0 -std=gnu++11 -fPIC -c -DCM_$(subst gen,GEN,$(GEN_MODE)) -rdynamic -ffloat-store -D_DEBUG 
 
 #.PHONY: clean, hw_x64, all
@@ -42,7 +42,7 @@ HW_X64_APP_OBJS := $(HW_APP_SOURCES:.cpp=.o)
 
 hw_x64: $(HW_X64_APP) $(HW_X64_ISA)
 $(HW_X64_APP): $(HW_X64_APP_OBJS)
-	$(CXX) $^ $(HW_LDFLAGS) -rdynamic $(CM_ROOT)/runtime/lib/x64/igfxcmrt64.so -o $@
+	$(CXX) $^ $(HW_LDFLAGS) -rdynamic $(CM_ROOT)/runtime/lib/x64/libigfxcmrt.so -o $@
 $(HW_X64_APP_OBJS): %.o: %.cpp 
 	$(CXX) $< $(HW_CXXFLAGS) -o $@