yet another gelu #293

ngc92 · 2024-04-29T18:53:51Z

A more complicated Packed128 for cleaner kernels

ngc92 · 2024-04-29T19:14:39Z

This is what it would look like if we moved all the casting into the load/store functions:

// Fixed-size bundle of `ElementType` values that occupies exactly as many
// bytes as an `int4` and is aligned to 16 bytes, so that a whole bundle can be
// bit-copied to or from a single `int4`.
//
// `size` is the number of elements in one bundle
// (sizeof(int4) / sizeof(ElementType)); indices passed to the accessors are
// not range-checked.
template<class ElementType>
struct alignas(16) Packed128 {
    // Mutable access to the element at `index`.
    __device__ ElementType& operator[](int index) {
        return payload[index];
    }
    // Read-only access to the element at `index`.
    __device__ const ElementType& operator[](int index) const {
        return payload[index];
    }
    // Returns the element at `index` converted to float via static_cast.
    // Const-qualified because it only reads, so it is usable on const packets
    // (and on packets reached through const references) as well.
    __device__ float fp32(int index) const {
        return static_cast<float>(payload[index]);
    }
    // Number of ElementType values that fit into sizeof(int4) bytes.
    static constexpr const size_t size = sizeof(int4) / sizeof(ElementType);

    // Raw element storage; this is what load/store helpers bit-copy.
    ElementType payload[size];
};

// Load one Packed128 from `address`.
// The pointer is reinterpreted as `const int4*` and read with a single
// `generic_load` using the requested load mode, so `address` must be suitably
// aligned for an int4 access. The loaded bits are then copied unchanged into
// the returned packet.
template<class ElementType, ELoadMode Mode=ELoadMode::CA>
__device__ __forceinline__ Packed128<ElementType> load_aligned(const ElementType* address, load_mode_t<Mode> mode = {}) {
    static_assert(sizeof(int4) == sizeof(Packed128<ElementType>), "Size mismatch.");
    const int4 raw = generic_load(reinterpret_cast<const int4*>(address), mode);
    Packed128<ElementType> packed;
    // memcpy rather than a pointer cast: a plain bit-copy between the two representations
    memcpy(&packed, &raw, sizeof(raw));
    return packed;
}

// Use this function to store a Packed128 to an aligned memory address.
// The packet's bits are copied unchanged into an `int4`, which is then written
// through `generic_store` with the requested store mode. `target` is
// reinterpreted as `int4*`, so it must be suitably aligned for an int4 access.
// Marked __forceinline__ to match its counterpart `load_aligned`.
template<class ElementType, EStoreMode Mode=EStoreMode::WB>
__device__ __forceinline__ void store_aligned(ElementType* target, Packed128<ElementType> value, store_mode_t<Mode> mode = {}) {
    int4 bits;
    static_assert(sizeof(bits) == sizeof(value), "Size mismatch.");
    // memcpy rather than a pointer cast: a plain bit-copy between the two representations
    memcpy(&bits, &value, sizeof(bits));
    generic_store(reinterpret_cast<int4*>(target), bits, mode);
}

karpathy · 2024-04-29T22:25:39Z

dev/cuda/gelu_forward.cu

+            float cube = 0.044715f * xi * xi * xi;
+            packet_out[k] = 0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)));
+        }
+        store_aligned(out + i, packet_out);


Any reason we load with cs but store without cs?

For this kernel, we read exactly once and store exactly once, so these hints don't gain us anything within the kernel itself. But by not keeping the input in cache while keeping the output there, maybe the next kernel can be a bit faster. This is just guesswork, though; I haven't actually measured it.

got it! that makes sense actually

ngc92 · 2024-04-29T22:44:45Z

Closed by #298

yet another gelu

45252d4

ngc92 added 3 commits April 29, 2024 22:19

slight simplification

896b644

simplify to only support cs

9f07a17

comments

4872c57

karpathy reviewed Apr 29, 2024

View reviewed changes

ngc92 closed this Apr 29, 2024

ngc92 deleted the yet-another-gelu branch May 19, 2024 08:39

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

yet another gelu #293

yet another gelu #293

ngc92 commented Apr 29, 2024

ngc92 commented Apr 29, 2024 •

edited

karpathy Apr 29, 2024

ngc92 Apr 29, 2024

karpathy Apr 29, 2024

ngc92 commented Apr 29, 2024

yet another gelu #293

yet another gelu #293

Conversation

ngc92 commented Apr 29, 2024

ngc92 commented Apr 29, 2024 • edited

karpathy Apr 29, 2024

Choose a reason for hiding this comment

ngc92 Apr 29, 2024

Choose a reason for hiding this comment

karpathy Apr 29, 2024

Choose a reason for hiding this comment

ngc92 commented Apr 29, 2024

ngc92 commented Apr 29, 2024 •

edited