Skip to content

Commit

Permalink
linear algebra refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
Jack Dermody committed Feb 27, 2024
1 parent 3bee8e5 commit 876a106
Show file tree
Hide file tree
Showing 11 changed files with 510 additions and 770 deletions.
72 changes: 36 additions & 36 deletions BrightData.Cuda/BrightData.Cuda.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

162 changes: 0 additions & 162 deletions BrightData.Cuda/CudaLinearAlgebraProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -788,168 +788,6 @@ public override float Sum(IReadOnlyNumericSegment<float> segment)
return Provider.SumValues(segment.GetDeviceMemoryPtr(), segment.Size);
}

/// <inheritdoc />
public override IMatrix AddMatrices(ITensor3D tensor)
{
    // Folds every matrix-sized window of the tensor's device memory into one newly
    // created segment, then wraps that segment as a matrix with the tensor's row/column counts.
    var windowSize = tensor.MatrixSize;
    var source = tensor.Segment.GetDeviceMemoryPtr();
    // second argument presumably requests a zero-initialised segment - confirm against CreateSegment
    var accumulator = (CudaTensorSegment)CreateSegment(windowSize, true);
    var target = accumulator.DeviceMemory;

    uint depthIndex = 0;
    while (depthIndex < tensor.Depth) {
        // window over the matrix stored at this depth
        var window = source.Offset(depthIndex * windowSize, windowSize);
        Provider.AddInPlace(target, window, windowSize, 1f, 1f);
        ++depthIndex;
    }

    return new CudaMatrix(accumulator, tensor.RowCount, tensor.ColumnCount, this);
}

/// <inheritdoc />
public override ITensor3D AddPadding(ITensor3D tensor, uint padding)
{
    // Delegates to the provider's padding routine, passing a count of 1 because a single
    // 3D tensor is supplied, and wraps the returned memory using the reported dimensions.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var (paddedMemory, paddedRows, paddedColumns) = Provider.TensorAddPadding(
        input,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        padding
    );
    var paddedSegment = CreateCudaTensorSegment(paddedMemory);
    return new CudaTensor3D(paddedSegment, tensor.Depth, paddedRows, paddedColumns, this);
}

/// <inheritdoc />
public override void AddToEachRow(ITensor3D tensor, IVector vector)
{
    // Applies the vector's segment to each matrix of the tensor in turn; every matrix
    // obtained from GetMatrix is disposed as soon as it has been updated.
    uint depth = tensor.Depth;
    for (uint index = 0; index < depth; ++index) {
        using (var slice = tensor.GetMatrix(index)) {
            slice.AddToEachRow(vector.Segment);
        }
    }
}

/// <inheritdoc />
public override void AddToEachColumn(ITensor3D tensor, IVector vector)
{
    // Applies the vector's segment to each matrix of the tensor in turn; every matrix
    // obtained from GetMatrix is disposed as soon as it has been updated.
    uint depth = tensor.Depth;
    for (uint index = 0; index < depth; ++index) {
        using (var slice = tensor.GetMatrix(index)) {
            slice.AddToEachColumn(vector.Segment);
        }
    }
}

/// <inheritdoc />
public override IVector ColumnSums(ITensor4D tensor)
{
    // For each 3D tensor inside the 4D tensor: run the provider's column sum over its
    // device memory and add the result into a depth-sized accumulator, returned as a vector.
    uint matrixSize = tensor.MatrixSize;
    uint tensorSize = tensor.TensorSize;
    uint depth = tensor.Depth;
    uint count = tensor.Count;

    var totals = (CudaTensorSegment)CreateSegment(depth, true);
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var totalsMemory = totals.DeviceMemory;

    // scratch allocation, sliced per tensor below and released when this method exits
    using var scratch = Provider.Allocate(count * matrixSize, null, true);

    uint index = 0;
    while (index < count) {
        var current = input.Offset(index * tensorSize, tensorSize);
        var scratchSlice = scratch.Offset(index * matrixSize, matrixSize);

        // the scratch slice is handed to SumColumns as its last argument - presumably the
        // destination buffer; confirm against Provider.SumColumns
        using (var sums = Provider.SumColumns(current, matrixSize, depth, scratchSlice)) {
            Provider.AddInPlace(totalsMemory, sums, depth, 1f, 1f);
        }
        ++index;
    }
    return new CudaVector(totals, this);
}

/// <inheritdoc />
public override IMatrix Im2Col(ITensor3D tensor, uint filterWidth, uint filterHeight, uint xStride, uint yStride)
{
    // Delegates to the provider's im2col routine for a single tensor (count of 1) and
    // wraps the returned memory as a matrix; the fourth returned value is not needed here.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var (outputMemory, outputRows, outputColumns, _) = Provider.TensorIm2Col(
        input,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        filterWidth,
        filterHeight,
        xStride,
        yStride
    );
    var outputSegment = CreateCudaTensorSegment(outputMemory);
    return new CudaMatrix(outputSegment, outputRows, outputColumns, this);
}

/// <inheritdoc />
public override (ITensor3D Result, ITensor3D? Indices) MaxPool(ITensor3D tensor, uint filterWidth, uint filterHeight, uint xStride, uint yStride, bool saveIndices)
{
    // Delegates to the provider's max pool routine for a single tensor (count of 1).
    // The index tensor is only built when the provider returned index memory.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var (pooledMemory, indexMemory, pooledRows, pooledColumns) = Provider.TensorMaxPool(
        input,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        filterWidth,
        filterHeight,
        xStride,
        yStride,
        saveIndices
    );

    var pooled = new CudaTensor3D(CreateCudaTensorSegment(pooledMemory), tensor.Depth, pooledRows, pooledColumns, this);
    if (indexMemory is null)
        return (pooled, null);

    // indices share the pooled output's dimensions
    var pooledIndices = new CudaTensor3D(CreateCudaTensorSegment(indexMemory), tensor.Depth, pooledRows, pooledColumns, this);
    return (pooled, pooledIndices);
}

/// <inheritdoc />
public override ITensor3D Multiply(ITensor3D tensor, IMatrix matrix)
{
    // Multiplies every matrix in the tensor by the same matrix with one strided, batched
    // cuBLAS sgemm call and returns the products as a new tensor of the same depth.
    // NOTE(review): the leading dimension passed for the matrix is tensor.ColumnCount, so this
    // assumes matrix.RowCount == tensor.ColumnCount - confirm that callers guarantee it.
    var ptr = tensor.Segment.GetDeviceMemoryPtr();
    uint rowsA = tensor.RowCount, columnsArowsB = tensor.ColumnCount, columnsB = matrix.ColumnCount;
    float alpha = 1.0f, beta = 0.0f;
    var outputPtr = Provider.Allocate(tensor.RowCount * columnsB * tensor.Depth);

    try {
        var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(Provider.Blas,
            Operation.NonTranspose,
            Operation.NonTranspose,
            (int)rowsA,
            (int)columnsB,
            (int)columnsArowsB,
            ref alpha,
            ptr.DevicePointer,
            (int)rowsA,
            tensor.MatrixSize,                  // step between consecutive matrices of the tensor
            matrix.Segment.GetDevicePointer(),
            (int)columnsArowsB,
            0,                                  // zero stride: the same matrix is reused for every batch item
            ref beta,
            outputPtr.DevicePointer,
            (int)rowsA,
            tensor.RowCount * columnsB,         // step between consecutive output matrices
            (int)tensor.Depth
        );
        if (status != CuBlasStatus.Success)
            throw new CudaBlasException(status);
    }
    catch {
        // The buffer has not been handed to a tensor yet, so nothing else would release it:
        // free it here instead of leaking device memory, then let the failure propagate unchanged.
        outputPtr.Dispose();
        throw;
    }

    // only wrap the buffer once it holds a valid result
    return new CudaTensor3D(CreateCudaTensorSegment(outputPtr), tensor.Depth, tensor.RowCount, columnsB, this);
}

/// <inheritdoc />
public override ITensor3D RemovePadding(ITensor3D tensor, uint padding)
{
    // Delegates to the provider's padding removal routine for a single tensor (count of 1)
    // and wraps the returned memory using the reported dimensions.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var (trimmedMemory, trimmedRows, trimmedColumns) = Provider.TensorRemovePadding(
        input,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        padding
    );
    var trimmedSegment = CreateCudaTensorSegment(trimmedMemory);
    return new CudaTensor3D(trimmedSegment, tensor.Depth, trimmedRows, trimmedColumns, this);
}

/// <inheritdoc />
public override ITensor3D ReverseIm2Col(ITensor3D tensor, IMatrix filter, uint outputRows, uint outputColumns, uint outputDepth, uint filterWidth, uint filterHeight, uint xStride, uint yStride)
{
    // Delegates to the provider's reverse im2col routine for a single tensor (count of 1).
    // The result tensor takes its depth, rows and columns from what the provider reports;
    // the fifth returned value is not needed here.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var filterMemory = filter.Segment.GetDeviceMemoryPtr();
    var (resultMemory, resultRows, resultColumns, resultDepth, _) = Provider.TensorReverseIm2Col(
        input,
        filterMemory,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        outputRows,
        outputColumns,
        outputDepth,
        filterWidth,
        filterHeight,
        xStride,
        yStride
    );
    var resultSegment = CreateCudaTensorSegment(resultMemory);
    return new CudaTensor3D(resultSegment, resultDepth, resultRows, resultColumns, this);
}

/// <inheritdoc />
public override ITensor3D ReverseMaxPool(ITensor3D tensor, ITensor3D indices, uint outputRows, uint outputColumns, uint filterWidth, uint filterHeight, uint xStride, uint yStride)
{
    // Delegates to the provider's reverse max pool routine for a single tensor (count of 1).
    // The result keeps the input's depth and uses the caller-supplied output rows/columns.
    var input = tensor.Segment.GetDeviceMemoryPtr();
    var indexMemory = indices.Segment.GetDeviceMemoryPtr();
    var resultMemory = Provider.TensorReverseMaxPool(
        input,
        indexMemory,
        tensor.RowCount,
        tensor.ColumnCount,
        tensor.Depth,
        1,
        outputRows,
        outputColumns,
        filterWidth,
        filterHeight,
        xStride,
        yStride
    );
    var resultSegment = CreateCudaTensorSegment(resultMemory);
    return new CudaTensor3D(resultSegment, tensor.Depth, outputRows, outputColumns, this);
}

/// <inheritdoc />
public override ITensor3D TransposeFirstAndMultiply(ITensor3D tensor, ITensor4D other)
{
    // Multiplies the transpose of every matrix in the tensor with the matching block of the
    // 4D tensor using one strided, batched cuBLAS sgemm call; each block of the 4D tensor is
    // addressed as a (RowCount * ColumnCount) x Depth matrix.
    // NOTE(review): the leading dimensions passed below assume tensor.RowCount equals
    // other.RowCount * other.ColumnCount - confirm that callers guarantee it.
    var ptr = tensor.Segment.GetDeviceMemoryPtr();
    var ptr2 = other.Segment.GetDeviceMemoryPtr();
    uint rowsA = tensor.RowCount, columnsA = tensor.ColumnCount, columnsB = other.Depth, rowsB = other.RowCount * other.ColumnCount, blockSize2 = columnsB * rowsB;
    float alpha = 1.0f, beta = 0.0f;
    var outputPtr = Provider.Allocate(tensor.ColumnCount * columnsB * tensor.Depth);

    try {
        var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(Provider.Blas,
            Operation.Transpose,
            Operation.NonTranspose,
            (int)columnsA,
            (int)columnsB,
            (int)rowsB,
            ref alpha,
            ptr.DevicePointer,
            (int)rowsA,
            tensor.MatrixSize,                  // step between consecutive matrices of the tensor
            ptr2.DevicePointer,
            (int)rowsB,
            blockSize2,                         // step between consecutive blocks of the 4D tensor
            ref beta,
            outputPtr.DevicePointer,
            (int)columnsA,
            tensor.ColumnCount * columnsB,      // step between consecutive output matrices
            (int)tensor.Depth
        );
        if (status != CuBlasStatus.Success)
            throw new CudaBlasException(status);
    }
    catch {
        // The buffer has not been handed to a tensor yet, so nothing else would release it:
        // free it here instead of leaking device memory, then let the failure propagate unchanged.
        outputPtr.Dispose();
        throw;
    }

    // NOTE(review): the gemm writes columnsA x columnsB results (leading dimension columnsA), while
    // the tensor below is declared with columnsB before tensor.ColumnCount - the shape arguments are
    // kept exactly as they were; confirm the intended row/column order.
    return new CudaTensor3D(CreateCudaTensorSegment(outputPtr), tensor.Depth, columnsB, tensor.ColumnCount, this);
}

/// <inheritdoc />
public override INumericSegment<float>[] MultiSoftmax(ArraySegment<IReadOnlyNumericSegment<float>> segments)
{
Expand Down

0 comments on commit 876a106

Please sign in to comment.