Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
Jack Dermody committed Feb 24, 2024
1 parent f758203 commit 6285de3
Show file tree
Hide file tree
Showing 84 changed files with 6,070 additions and 5,204 deletions.
11 changes: 6 additions & 5 deletions BrightData.Cuda/BrightData.Cuda.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

370 changes: 173 additions & 197 deletions BrightData.Cuda/CudaLinearAlgebraProvider.cs

Large diffs are not rendered by default.

29 changes: 27 additions & 2 deletions BrightData.Cuda/CudaMatrix.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using BrightData.LinearAlgebra;
using BrightData.Cuda.CudaToolkit.Types;
using BrightData.Cuda.CudaToolkit;
using BrightData.LinearAlgebra;

namespace BrightData.Cuda
{
Expand All @@ -13,7 +15,7 @@ public override IVector GetColumnVector(uint index)
{
var segment = (CudaTensorSegment)Segment;
var ptr = segment.DeviceMemory.Offset(index * RowCount, RowCount);
return Lap.CreateVector(new CudaTensorSegment(ptr));
return Lap.CreateVector(new CudaTensorSegment(ptr, lap.Provider));
}

/// <inheritdoc />
Expand All @@ -23,5 +25,28 @@ public override IVector GetRowVector(uint index)
var segment = Lap.GetNonContinuousSegment(Segment, index, RowCount, ColumnCount);
return Lap.CreateVector(segment);
}

/// <inheritdoc />
public override unsafe IMatrix Transpose()
{
var provider = lap.Provider;
var ret = provider.Allocate(RowCount * ColumnCount);
float alpha = 1.0f, beta = 0.0f;
CudaBlasNativeMethods.cublasSgeam(provider.Blas,
Operation.Transpose,
Operation.NonTranspose,
(int)ColumnCount,
(int)RowCount,
ref alpha,
Segment.GetDevicePointer(),
(int)RowCount,
ref beta,
new CuDevicePtr(0),
(int)ColumnCount,
ret.DevicePointer,
(int)ColumnCount
);
return lap.CreateMatrix(ColumnCount, RowCount, new CudaTensorSegment(ret, lap.Provider));
}
}
}
7 changes: 7 additions & 0 deletions BrightData.Cuda/CudaProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ readonly CuFunction
_leakyRelu,
_leakyReluDerivative,
_memSet,
_memCpy,
_sumColumns,
_pointwiseDivide,
_sqrt,
Expand Down Expand Up @@ -211,6 +212,7 @@ public CudaProvider(BrightDataContext context, string? cudaKernelPath, string? c
_relu = _kernel.LoadFunction("RELU");
_reluDerivative = _kernel.LoadFunction("RELUDerivative");
_memSet = _kernel.LoadFunction("MemSet");
_memCpy = _kernel.LoadFunction("MemCpy");
_sumColumns = _kernel.LoadFunction("SumColumns");
_pointwiseDivide = _kernel.LoadFunction("PointwiseDivide");
_sqrt = _kernel.LoadFunction("Sqrt");
Expand Down Expand Up @@ -530,6 +532,11 @@ internal void MemSet(IDeviceMemoryPtr data, float value, uint count, CuStream* s
Invoke(_memSet, stream, count, data.DevicePointer, value, count, offset, increment);
}

internal void MemCpy(IDeviceMemoryPtr a, IDeviceMemoryPtr b, uint count, CuStream* stream = null, uint offsetA = 0, uint offsetB = 0, uint incrementA = 1, uint incrementB = 1)
{
Invoke(_memCpy, stream, count, a.DevicePointer, b.DevicePointer, count, offsetA, offsetB, incrementA, incrementB);
}

internal IDeviceMemoryPtr Sqrt(IDeviceMemoryPtr a, uint size, float valueAdjustment, uint ai = 1, uint bi = 1, CuStream* stream = null)
{
var ret = Allocate(size, stream);
Expand Down
30 changes: 23 additions & 7 deletions BrightData.Cuda/CudaTensorSegment.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using BrightData.Cuda.CudaToolkit;
using BrightData.LinearAlgebra.Segments;
using CommunityToolkit.HighPerformance.Buffers;

namespace BrightData.Cuda
{
internal class CudaTensorSegment(IDeviceMemoryPtr data) : INumericSegment<float>
internal class CudaTensorSegment(IDeviceMemoryPtr data, CudaProvider provider) : INumericSegment<float>
{
const string CudaSegmentType = "cuda";

Expand Down Expand Up @@ -78,18 +80,32 @@ public void CopyFrom(ReadOnlySpan<float> span, uint targetOffset)
DeviceMemory.CopyToDevice(span, targetOffset);
}

public void CopyTo(INumericSegment<float> segment, uint sourceOffset, uint targetOffset)
public unsafe void CopyTo(INumericSegment<float> segment, uint sourceOffset, uint targetOffset)
{
if (segment.SegmentType == CudaSegmentType) {
var other = (CudaTensorSegment)segment;
var target = targetOffset == 0 ? other.DeviceMemory : other.DeviceMemory.Offset(targetOffset, other.DeviceMemory.Size - targetOffset);
var source = sourceOffset == 0 ? DeviceMemory : DeviceMemory.Offset(sourceOffset, DeviceMemory.Size - sourceOffset);
target.CopyToDevice(source);
if (segment.SegmentType == CudaSegmentType)
Copy(DeviceMemory, (CudaTensorSegment)segment, sourceOffset, targetOffset);
else if (segment is MutableTensorSegmentWrapper<float> { UnderlyingSegment.SegmentType: CudaSegmentType } wrapper) {
if(wrapper.Stride == 1)
Copy(DeviceMemory, (CudaTensorSegment)wrapper.UnderlyingSegment, sourceOffset, targetOffset + wrapper.Offset);
else {
if (sourceOffset != 0 || targetOffset != 0)
throw new Exception("Not supported");
var other = (CudaTensorSegment)wrapper.UnderlyingSegment;
provider.MemCpy(other.DeviceMemory, DeviceMemory, wrapper.Size, null, wrapper.Offset, 0, wrapper.Stride, 1);
}
}
else {
using var buffer = ToNewMemoryOwner();
segment.CopyFrom(buffer.Span[(int)sourceOffset..], targetOffset);
}
return;

static void Copy(IDeviceMemoryPtr sourceSegment, CudaTensorSegment targetSegment, uint sourceOffset, uint targetOffset)
{
var target = targetOffset == 0 ? targetSegment.DeviceMemory : targetSegment.DeviceMemory.Offset(targetOffset, targetSegment.DeviceMemory.Size - targetOffset);
var source = sourceOffset == 0 ? sourceSegment : sourceSegment.Offset(sourceOffset, sourceSegment.Size - sourceOffset);
target.CopyToDevice(source);
}
}

public void CopyTo(Span<float> destination)
Expand Down
2 changes: 1 addition & 1 deletion BrightData.Cuda/CudaVector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ namespace BrightData.Cuda
{
internal class CudaVector(INumericSegment<float> data, CudaLinearAlgebraProvider lap) : MutableVector<CudaLinearAlgebraProvider>(data, lap), IHaveDeviceMemory
{
public IDeviceMemoryPtr Memory => CudaLinearAlgebraProvider.GetDeviceMemoryPtr(Segment);
public IDeviceMemoryPtr Memory => Segment.GetDeviceMemoryPtr();
}
}
13 changes: 13 additions & 0 deletions BrightData.Cuda/ExtensionMethods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Linq;
using System.Reflection;
using BrightData.Cuda.CudaToolkit;
using BrightData.Cuda.CudaToolkit.Types;
using BrightData.LinearAlgebra;

namespace BrightData.Cuda
Expand Down Expand Up @@ -82,5 +83,17 @@ internal static void CheckResult(this CuBlasStatus result)
if (result != CuBlasStatus.Success)
throw new CudaBlasException(result);
}

internal static CudaDeviceVariable<float> GetDeviceVariable(this INumericSegment<float> segment) => GetDeviceMemoryPtr(segment).DeviceVariable;
internal static IDeviceMemoryPtr GetDeviceMemoryPtr(this IReadOnlyNumericSegment<float> segment)
{
if (segment is not CudaTensorSegment cudaSegment)
throw new Exception("CUDA tensors can only be used with other CUDA tensors");
if (!segment.IsValid)
throw new Exception("CUDA tensor was not valid");
return cudaSegment.DeviceMemory;
}

internal static CuDevicePtr GetDevicePointer(this IReadOnlyNumericSegment<float> segment) => GetDeviceMemoryPtr(segment).DevicePointer;
}
}
65 changes: 36 additions & 29 deletions BrightData.Cuda/Helper/CudaTensorDataCache.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using BrightData.Cuda.CudaToolkit.Types;
using BrightData.DataTable;
using BrightData.DataTable.Columns;
using BrightData.LinearAlgebra.ReadOnly;
using CommunityToolkit.HighPerformance;

namespace BrightData.Cuda.Helper
{
Expand All @@ -15,17 +17,10 @@ public class CudaTensorDataCache : IDisposable
readonly CudaLinearAlgebraProvider _lap;
readonly IDeviceMemoryPtr _data;

/// <summary>
/// Constructor
/// </summary>
/// <param name="lap"></param>
/// <param name="table"></param>
public CudaTensorDataCache(CudaLinearAlgebraProvider lap, IDataTable table)
CudaTensorDataCache(CudaLinearAlgebraProvider lap, IDataTable table, ReadOnlyMemory<float> tensorData)
{
lap.BindThread();

// copy entire tensor block into CUDA device
var tensorData = table.GetTensorData();
lap.BindThread();
_data = new DeviceMemoryBlock(null, new CudaDeviceVariable<float>(tensorData.Length));
_data.CopyToDevice(tensorData.Span, 0);
_lap = lap;
Expand All @@ -34,52 +29,64 @@ public CudaTensorDataCache(CudaLinearAlgebraProvider lap, IDataTable table)
table.SetTensorMappers(GetVectors, GetMatrices, Get3DTensors, Get4DTensors);
}

ReadOnlyMemory<ReadOnlyVector> GetVectors(ReadOnlySpan<DataRangeColumnType> span)
/// <summary>
/// Creates a CUDA tensor cache
/// </summary>
/// <param name="lap"></param>
/// <param name="table"></param>
/// <returns></returns>
public static async Task<CudaTensorDataCache> Create(CudaLinearAlgebraProvider lap, IDataTable table)
{
var tensorData = await table.GetTensorData();
return new(lap, table, tensorData);
}

Task<ReadOnlyMemory<ReadOnlyVector>> GetVectors(ReadOnlyMemory<DataRangeColumnType> block)
{
var index = 0;
var ret = new ReadOnlyVector[span.Length];
foreach (ref readonly var item in span) {
var ret = new ReadOnlyVector[block.Length];
foreach (ref readonly var item in block.Span) {
var devicePtr = _data.Offset(item.StartIndex, item.Size);
var segment = new CudaTensorSegment(devicePtr);
var segment = new CudaTensorSegment(devicePtr, _lap.Provider);
ret[index++] = new(segment);
}
return ret;
return Task.FromResult(new ReadOnlyMemory<ReadOnlyVector>(ret));
}

ReadOnlyMemory<ReadOnlyMatrix> GetMatrices(ReadOnlySpan<MatrixColumnType> span)
Task<ReadOnlyMemory<ReadOnlyMatrix>> GetMatrices(ReadOnlyMemory<MatrixColumnType> block)
{
var index = 0;
var ret = new ReadOnlyMatrix[span.Length];
foreach (ref readonly var item in span) {
var ret = new ReadOnlyMatrix[block.Length];
foreach (ref readonly var item in block.Span) {
var devicePtr = _data.Offset(item.StartIndex, item.Size);
var segment = new CudaTensorSegment(devicePtr);
var segment = new CudaTensorSegment(devicePtr, _lap.Provider);
ret[index++] = new(segment, item.RowCount, item.ColumnCount);
}
return ret;
return Task.FromResult(new ReadOnlyMemory<ReadOnlyMatrix>(ret));
}

ReadOnlyMemory<ReadOnlyTensor3D> Get3DTensors(ReadOnlySpan<Tensor3DColumnType> span)
Task<ReadOnlyMemory<ReadOnlyTensor3D>> Get3DTensors(ReadOnlyMemory<Tensor3DColumnType> block)
{
var index = 0;
var ret = new ReadOnlyTensor3D[span.Length];
foreach (ref readonly var item in span) {
var ret = new ReadOnlyTensor3D[block.Length];
foreach (ref readonly var item in block.Span) {
var devicePtr = _data.Offset(item.StartIndex, item.Size);
var segment = new CudaTensorSegment(devicePtr);
var segment = new CudaTensorSegment(devicePtr, _lap.Provider);
ret[index++] = new(segment, item.Depth, item.RowCount, item.ColumnCount);
}
return ret;
return Task.FromResult(new ReadOnlyMemory<ReadOnlyTensor3D>(ret));
}

ReadOnlyMemory<ReadOnlyTensor4D> Get4DTensors(ReadOnlySpan<Tensor4DColumnType> span)
Task<ReadOnlyMemory<ReadOnlyTensor4D>> Get4DTensors(ReadOnlyMemory<Tensor4DColumnType> block)
{
var index = 0;
var ret = new ReadOnlyTensor4D[span.Length];
foreach (ref readonly var item in span) {
var ret = new ReadOnlyTensor4D[block.Length];
foreach (ref readonly var item in block.Span) {
var devicePtr = _data.Offset(item.StartIndex, item.Size);
var segment = new CudaTensorSegment(devicePtr);
var segment = new CudaTensorSegment(devicePtr, _lap.Provider);
ret[index++] = new(segment, item.Count, item.Depth, item.RowCount, item.ColumnCount);
}
return ret;
return Task.FromResult(new ReadOnlyMemory<ReadOnlyTensor4D>(ret));
}

/// <inheritdoc />
Expand Down
2 changes: 1 addition & 1 deletion BrightData.Cuda/Helper/DeviceMemoryBlockBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ public virtual void CopyToDevice(IDeviceMemoryPtr source)
/// <inheritdoc />
public virtual void CopyToDevice(ReadOnlySpan<float> span, uint offsetSource = 0)
{
fixed (float* p = &MemoryMarshal.GetReference(span))
fixed (float* p = span)
{
var ptr = p + offsetSource * sizeof(float);
DriverApiNativeMethods.SynchronousMemcpyV2.cuMemcpyHtoD_v2(DevicePointer, (IntPtr)ptr, Math.Min(Size, span.Length) * sizeof(float)).CheckResult();
Expand Down
2 changes: 1 addition & 1 deletion BrightData.Cuda/Helper/PtrToMemory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public void CopyToDevice(IDeviceMemoryPtr source)
}
public unsafe void CopyToDevice(ReadOnlySpan<float> span, uint offsetSource)
{
fixed (float* p = &MemoryMarshal.GetReference(span))
fixed (float* p = span)
{
DeviceVariable.CopyToDevice((IntPtr)p, offsetSource, 0, (int)Size * sizeof(float));
}
Expand Down
2 changes: 1 addition & 1 deletion BrightData.Cuda/Helper/StreamDeviceMemoryBlock.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public override void CopyToDevice(IDeviceMemoryPtr source)

public override void CopyToDevice(ReadOnlySpan<float> span, uint targetOffset = 0)
{
fixed (float* ptr = &MemoryMarshal.GetReference(span))
fixed (float* ptr = span)
{
DriverApiNativeMethods.AsynchronousMemcpyV2.cuMemcpyHtoDAsync_v2(DevicePointer + targetOffset * sizeof(float), (IntPtr)ptr, Size * sizeof(float), stream).CheckResult();
}
Expand Down
7 changes: 7 additions & 0 deletions BrightData.Cuda/cuda/brightwire.cu
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,13 @@ extern "C"
}
}

__global__ void MemCpy(float* __restrict a, float* __restrict b, uint count, uint offsetA, uint offsetB, uint ai, uint bi)
{
for (uint index = blockDim.x * blockIdx.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
a[offsetA + (index * ai)] = b[offsetB + (index * bi)];
}
}

__global__ void FindMinAndMax(const float* __restrict a, uint count, float* __restrict minBlock, float* __restrict maxBlock, uint ai)
{
uint tidX = threadIdx.x;
Expand Down

0 comments on commit 6285de3

Please sign in to comment.