Skip to content

Commit

Permalink
update the scan and the copy examples
Browse files Browse the repository at this point in the history
Change-Id: Idd886e2bfbd0b2df7997d6be3b2eba46c59f1941
  • Loading branch information
gangche1 committed Jan 8, 2020
1 parent 06b0d73 commit 2c05d2b
Show file tree
Hide file tree
Showing 32 changed files with 2,107 additions and 681 deletions.
265 changes: 228 additions & 37 deletions test/external_contribution/CopyPipelineHigh6/CPipeline.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,38 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cm_rt.h"
#include "Rgb2Encode.h"
#include "ErrorDiffusion.h"

#define BLOCK_WIDTH 8
#define BLOCK_HEIGHT 8
#define BLOCK_HEIGHT 4

Rgb2Encode::Rgb2Encode(CmDevice *pdevice)
ErrorDiffusion::ErrorDiffusion(CmDevice *pdevice)
{
m_pCmDev = pdevice;
m_pKernel = NULL;

float coefficients[9] = {0.299, 0.587, 0.114, -0.168736, -0.331264, 0.5, 0.5, -0.418688, -0.081312};
std::copy(coefficients, coefficients + 9, m_coeffs);

float offsets[3] = {0.0, 128.0, 128.0};
std::copy(offsets, offsets + 3, m_offsets);
}

char * Rgb2Encode::GetKernelName(const int yuvFormat)
ErrorDiffusion::~ErrorDiffusion(void)
{
if (yuvFormat == 0) // YUV444
return "Rgb2YCbCr_GENX";
else if (yuvFormat == 1) // YUV400 or grayscale
return "Rgb2Y8_GENX";
free(m_pErrRowBuf);
free(m_pErrColBuf);
}

int Rgb2Encode::PreRun(
int ErrorDiffusion::PreRun(
CmKernel *pKernel,
SurfaceIndex *pSI_SrcSurfR,
SurfaceIndex *pSI_SrcSurfG,
SurfaceIndex *pSI_SrcSurfB,
SurfaceIndex *pSI_DstSurf,
SurfaceIndex *pSI_SrcSurfCMYK,
SurfaceIndex *pSI_DstSurfCMYK,
int nPicWidth,
int nPicHeight
)
{
int result;

CmThreadSpace *pTS = NULL;
CmBuffer *pErrRowBuf = NULL;
CmBuffer *pErrColBuf = NULL;
SurfaceIndex *pSI_ErrRowBuf = NULL;
SurfaceIndex *pSI_ErrColBuf = NULL;

int nPicWidthInBlk, nPicHeightInBlk;
int nKernelInput;
Expand All @@ -66,16 +61,33 @@ int Rgb2Encode::PreRun(
nPicWidthInBlk = (nPicWidth + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
nPicHeightInBlk = (nPicHeight + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT;

CM_Error_Handle(m_pCmDev->CreateThreadSpace(nPicWidthInBlk, nPicHeightInBlk, pTS), "CreateThreadSpace Error");
CM_Error_Handle(m_pCmDev->CreateThreadSpace((nPicWidthInBlk+1), nPicHeightInBlk, pTS), "CreateThreadGroupSpace Error");

CM_Error_Handle(pTS->SelectThreadDependencyPattern(CM_WAVEFRONT26), "Select thread dependency error");

int rowErrBufferSize = (nPicWidthInBlk + 1) * 64;
uchar *pRowErr = (uchar *) CM_ALIGNED_MALLOC(rowErrBufferSize, 0x1000);
memset(pRowErr, 0, rowErrBufferSize);

int colErrBufferSize = nPicHeightInBlk * 128;
uchar *pColErr = (uchar *) CM_ALIGNED_MALLOC(colErrBufferSize, 0x1000);
memset(pColErr, 0, colErrBufferSize);

CM_Error_Handle(m_pCmDev->CreateBuffer(rowErrBufferSize, pErrRowBuf),
"CreateBuffer Error");
pErrRowBuf->GetIndex(pSI_ErrRowBuf);

CM_Error_Handle(m_pCmDev->CreateBuffer(colErrBufferSize, pErrColBuf),
"CreateBuffer Error");
pErrColBuf->GetIndex(pSI_ErrColBuf);

// Set up kernel args for Pipeline filter
nKernelInput = 0;
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfR);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfG);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfB);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_DstSurf);
result = m_pKernel->SetKernelArg(nKernelInput++, 9*sizeof(float), &m_coeffs[0]);
result = m_pKernel->SetKernelArg(nKernelInput++, 3*sizeof(float), &m_offsets[0]);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_SrcSurfCMYK);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_ErrRowBuf);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_ErrColBuf);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(SurfaceIndex), pSI_DstSurfCMYK);
result = m_pKernel->SetKernelArg(nKernelInput++, sizeof(int), &nPicWidthInBlk );
result = m_pKernel->AssociateThreadSpace(pTS);

CM_Error_Handle(result, "SetKernelArg Error");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cm/cm.h>

#define BLKW 8
#define BLKH 4
#define ITERATION 4

#define X 0
#define Y 1

/*
* Error diffusion pushes each pixel's quantization error to its neighbouring
* pixels, so the algorithm is inherently sequential. Processing the pixels in
* the skewed order below allows some parallelism without violating those
* dependencies.
* pixelLocX and pixelLocY give, for each pixel in that processing order, its
* (x, y) offset relative to the block origin.
*
* Some pixels can be processed together and some cannot. To avoid conditional
* checks, a pixel that cannot be processed in parallel with other pixels is
* simply processed again; with SIMD execution this has no performance penalty.
*/

// Per-pixel x offsets from the block origin, one line per pixel row of the
// block. Each row starts two pixels further left than the row above it.
short pixelLocX[32] = {
     0,  1,  2,  3,  4,  5,  6,  7,
    -2, -1,  0,  1,  2,  3,  4,  5,
    -4, -3, -2, -1,  0,  1,  2,  3,
    -6, -5, -4, -3, -2, -1,  0,  1
};

// Per-pixel y offsets from the block origin, laid out like pixelLocX.
short pixelLocY[32] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3
};
// Error-diffusion weights in sixteenths (ErrorDiffusion_GENX divides them by
// 16). Entries [0], [1] and [2] scale the previous row's error at column
// offsets colIndex, colIndex+4 and colIndex+8; entry [3] scales the error of
// the pixel immediately to the left.
uchar weight[4] = {1,5,3,7};

/*
* ErrorDiffusion_GENX - error-diffusion thresholding of one BLKW x BLKH block.
*
* Each thread handles the block at its thread origin. The block is skewed:
* pixel row r is shifted 2*r pixels to the left (see pixelLocX/pixelLocY).
* Every 32-bit input element is treated as four independent byte channels;
* each channel is thresholded to 0x00 or 0xff and the remaining error is
* passed on to neighbouring pixels.
*
* SrcSI       - typed input surface, read one element per pixel.
* ErrBufRowSI - buffer carrying the bottom-row error of each block to the
*               block row below (read at the start, written at the end).
* ErrBufColSI - buffer carrying the right-edge error of each block to the
*               block on its right (read at the start, written at the end).
* DstSI       - typed output surface, written at the same coordinates as
*               the input was read from.
* widthBlk    - block-column index at which the right-border handling is
*               applied (pos(X) == widthBlk).
*
* Layout of errBuf (5 rows x 64 shorts, 4 shorts per pixel):
*   row 0      - error coming from the block row above,
*   row i + 1  - error produced by pixel row i of this block.
*/
extern "C" _GENX_MAIN_ void
ErrorDiffusion_GENX(
SurfaceIndex SrcSI,
SurfaceIndex ErrBufRowSI,
SurfaceIndex ErrBufColSI,
SurfaceIndex DstSI,
int widthBlk
)
{
matrix<uint, BLKH, 8> input;
matrix<uint, BLKH, 8> output;
matrix<short, 5, 64> errBuf = 0;
vector<short, 64> errRowTemp;
vector<short, 2> pos, pos4;
vector<uint, 32> inputVectorX(pixelLocX);
vector<uint, 32> inputVectorY(pixelLocY);
matrix<uint, 4, 16> tmpInput;

// Paired with cm_signal() at the end of the kernel; presumably blocks until
// the threads this one depends on have signalled - confirm against the
// dependency pattern selected by the host.
cm_wait();

pos(X) = get_thread_origin_x();
pos(Y) = get_thread_origin_y();

// Block origin in pixels.
pos4(X) = pos(X) * BLKW;
pos4(Y) = pos(Y) * BLKH;

// Turn the per-pixel offsets into absolute surface coordinates.
inputVectorX += pos4(X);
inputVectorY += pos4(Y);

// Gather pixel rows 0 and 1 (16 pixels) with a single typed read.
read_typed(SrcSI, CM_R_ENABLE, tmpInput.select<1,1,16,1>(0,0),
inputVectorX.select<16,1>(0), inputVectorY.select<16,1>(0));
input.select<1,1,8,1>(0,0) = tmpInput.select<1,1,8,1>(0,0);
input.select<1,1,8,1>(1,0) = tmpInput.select<1,1,8,1>(0,8);

// Gather pixel rows 2 and 3 the same way.
read_typed(SrcSI, CM_R_ENABLE, tmpInput.select<1,1,16,1>(0,0),
inputVectorX.select<16,1>(16), inputVectorY.select<16,1>(16));
input.select<1,1,8,1>(2,0) = tmpInput.select<1,1,8,1>(0,0);
input.select<1,1,8,1>(3,0) = tmpInput.select<1,1,8,1>(0,8);

// Need to update, try to use single column for column error buffer and
// single row for row buffer (single row for row buffer may not possible)
// Each block row owns 128 bytes of the column buffer and each block column
// owns 64 bytes of the row buffer. The row read starts 24 bytes (12 shorts,
// i.e. 3 pixels) before the slot of the block to the right.
// NOTE(review): for pos(X) == widthBlk the 80-byte read below appears to end
// past the (widthBlk + 1) * 64 bytes the host allocates - confirm.
int colErrBufOffset = (pos(Y))*128;
int rowErrBufOffset = (pos(X)+1)*64-24;


// The top block row has no block above it, so its incoming row error is zero.
// The row offset is a multiple of 8 but not of 16, which is presumably why
// the DWALIGNED read form is used here.
if (pos(Y) != 0)
read(DWALIGNED(ErrBufRowSI), rowErrBufOffset, errRowTemp.select<40,1>(0));
else
errRowTemp.select<40,1>(0) = 0;

errBuf.select<1,1,40,1>(0,24) = errRowTemp.select<40,1>(0);

// The leftmost block column has no block to its left, so its incoming column
// error is zero.
if (pos(X) != 0)
read(ErrBufColSI, colErrBufOffset , errRowTemp.select<40,1>(0));
else
errRowTemp.select<40,1>(0) = 0;

// Unpack the 40 column-error values: 12 shorts (3 pixels) for each of rows
// 1..3 and 4 shorts (1 pixel) for row 4. The start column moves 8 shorts
// (2 pixels) to the left per row, following the skew of the block.
errBuf.select<1,1,12,1>(1,16) = errRowTemp.select<12,1>(0);
errBuf.select<1,1,12,1>(2,8) = errRowTemp.select<12,1>(12);
errBuf.select<1,1,12,1>(3,0) = errRowTemp.select<12,1>(24);
errBuf.select<1,1,4,1>(4,0) = errRowTemp.select<4,1>(36);

// The reads take one extra column to the left and to the right, so the data
// maps directly into the error buffer.
//
// The input block is a diagonal box, but we read a rectangle that wraps it:
// ________
// _| _|
// _| _|
// _| _|
//|_______ |
//
vector<uint, 8> oldpixel, newpixel;

// Byte views of one pixel row: 8 pixels x 4 channels = 32 bytes.
vector_ref<uchar, 32> oldpixelc = oldpixel.format<uchar>();
vector_ref<uchar, 32> newpixelc = newpixel.format<uchar>();
vector<float, 32> fxs;

vector<float, 4> weightf(weight);

// Convert the integer weights into fractions of 16.
weightf =weightf/16.0f;

// Reminder: the error buffer is matrix<short, 5, 64> errBuf, zero-initialized.
int colIndex, rowIndex;
#pragma unroll
for (int i=0; i < ITERATION; i++)
{
fxs = 0;
// oldpixel holds one row of the input block
oldpixel = input.select<1,1,8,1>(i,0);

// Pixel row i starts 2*i pixels (8*i shorts) further left in errBuf.
colIndex = 24 - i*8;
rowIndex = i;
// Right border: clear columns 28..63 of every errBuf row. This runs on each
// iteration, so it also clears error stored there by earlier iterations.
if (pos(X) == widthBlk)
errBuf.select<5,1,36,1>(0, 28) = 0;

// Accumulate the weighted error of the row above at three column offsets,
// each one pixel (4 shorts) apart.
fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex) * weightf[0];

fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex+4) * weightf[1];

fxs += errBuf.select<1,1,32,1>(rowIndex,colIndex+8) * weightf[2];

// Left border: the first 2*i pixels of row i have negative x offsets (see
// pixelLocX), so their accumulated error (8*i values) is discarded.
if (pos(X) == 0)
{
if (i == 1)
fxs.select<8,1>(0) = 0;
else if (i == 2)
fxs.select<16,1>(0) = 0;
else if (i == 3)
fxs.select<24,1>(0) = 0;
}

// From here on rowIndex is the errBuf row that receives this row's error.
rowIndex += 1;

// Error of the pixel immediately to the left of the first pixel in this row.
vector<float, 4> pixelleft (errBuf.select<1,1,4,1>(rowIndex, colIndex));
vector<float, 4> singlefxs;
int rowi;

// Walk the 8 pixels left to right; the 4 channels of a pixel are processed
// together, and each pixel's error feeds the next pixel via pixelleft.
#pragma unroll
for (int ii=0; ii < 8; ii++)
{
rowi = ii*4;

singlefxs = pixelleft * weightf[3];

// Value to threshold = left error + error from the row above + input.
singlefxs = singlefxs + fxs.select<4,1>(rowi) +
oldpixelc.select<4,1>(rowi);

// Threshold each channel to 0x00 or 0xff, then keep the difference (rounded
// by cm_rndd) as this pixel's error.
newpixelc.select<4,1>(rowi) = (singlefxs > 127.0f)* 0xff;
fxs.select<4,1>(rowi) = cm_rndd<float>(singlefxs - newpixelc.select<4,1>(rowi));
pixelleft = fxs.select<4,1>(rowi);

}

// Temporary workaround for incorrect data at the right border: discard the
// error of the trailing pixels of rows 1..3.
if (pos(X) == widthBlk)
{
if (i == 1)
fxs.select<24,1>(8) = 0;
else if (i == 2)
fxs.select<16,1>(16) = 0;
else if (i == 3)
fxs.select<8,1>(24) = 0;
}
// Store this row's error one pixel to the right of colIndex, where the next
// iteration reads it as the row above.
errBuf.select<1,1,32,1>(rowIndex,colIndex+4) = fxs;

output.select<1,1,8,1>(rowIndex-1, 0) = newpixel;

}

// The output must be written with a typed (scattered) write because the pixel
// coordinates follow the skewed layout used for the reads.
matrix<uint, 4, 16> tmpOut;

// Pixel rows 0 and 1.
tmpOut.select<1,1,8,1>(0,0) = output.select<1,1,8,1>(0,0);
tmpOut.select<1,1,8,1>(0,8) = output.select<1,1,8,1>(1,0);

write_typed(DstSI, CM_R_ENABLE, tmpOut,
inputVectorX.select<16,1>(0), inputVectorY.select<16,1>(0));

// Pixel rows 2 and 3.
tmpOut.select<1,1,8,1>(0,0) = output.select<1,1,8,1>(2,0);
tmpOut.select<1,1,8,1>(0,8) = output.select<1,1,8,1>(3,0);
write_typed(DstSI, CM_R_ENABLE, tmpOut,
inputVectorX.select<16,1>(16), inputVectorY.select<16,1>(16));

// Pack the right-edge error for the block to the right. The layout matches
// the unpacking near the top of the kernel, with source columns 32 shorts
// (one block width) further right.
// NOTE(review): only elements 0..39 are assigned, but all 64 are written to
// the column buffer below; elements 40..63 are uninitialized.
vector<short, 64> errBufColOut;

errBufColOut.select<12,1>(0) = errBuf.select<1,1,12,1>(1,48);
errBufColOut.select<12,1>(12) = errBuf.select<1,1,12,1>(2,40);
errBufColOut.select<12,1>(24) = errBuf.select<1,1,12,1>(3,32);
errBufColOut.select<4,1>(36) = errBuf.select<1,1,4,1>(4,32);

// The write offsets address this block's own slots (no -24 byte adjustment,
// unlike the row read above).
colErrBufOffset = (pos(Y))*128;
rowErrBufOffset = (pos(X))*64;

// Publish the bottom-row error (errBuf row 4, columns 4..35) and the packed
// column error.
write(ErrBufRowSI, rowErrBufOffset, errBuf.select<1,1,32,1>(4,4).format<short>());
write(ErrBufColSI, colErrBufOffset, errBufColOut.select<64,1>(0));

// Fence before signalling; presumably this makes the writes above visible to
// the threads that wait on this one - confirm.
cm_fence();
cm_signal();
}
6 changes: 3 additions & 3 deletions test/external_contribution/CopyPipelineHigh6/Makefile.linux
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
###########################################################################

#macro CM_ROOT and APP_NAME are passed as Make parameters
CM_ROOT := /home/gangche1/Linux_C_for_Media_Development_Package_20181022
#CM_ROOT := /home/gangche1/Linux_C_for_Media_Development_Package_20181022
#CMC := ../../../build.64.linux/bin/cmc
#CM_ROOT := ../..
CMC := $(CM_ROOT)/compiler/bin/cmc
Expand All @@ -20,7 +20,7 @@ HOST_FILE = $(filter-out $(KERNEL_FILE), $(ALL_FILE))
APP := $(APP_NAME)

CXX := g++
INCL := -I$(CM_ROOT)/runtime/include -I.. -Iinclude
INCL := -I$(CM_ROOT)/runtime/include -I.. -Iinclude -I$(CM_ROOT)/examples
CXXFLAGS := -w -g ${INCL} -msse4.1 -D__LINUX__ -DLINUX -O0 -std=gnu++11 -fPIC -c -DCM_$(subst gen,GEN,$(GEN_MODE)) -rdynamic -ffloat-store -D_DEBUG

#.PHONY: clean, hw_x64, all
Expand All @@ -42,7 +42,7 @@ HW_X64_APP_OBJS := $(HW_APP_SOURCES:.cpp=.o)

hw_x64: $(HW_X64_APP) $(HW_X64_ISA)
$(HW_X64_APP): $(HW_X64_APP_OBJS)
$(CXX) $^ $(HW_LDFLAGS) -rdynamic $(CM_ROOT)/runtime/lib/x64/igfxcmrt64.so -o $@
$(CXX) $^ $(HW_LDFLAGS) -rdynamic $(CM_ROOT)/runtime/lib/x64/libigfxcmrt.so -o $@
$(HW_X64_APP_OBJS): %.o: %.cpp
$(CXX) $< $(HW_CXXFLAGS) -o $@

Expand Down
Loading

0 comments on commit 2c05d2b

Please sign in to comment.