In [1]:
%%writefile Rope.cu

Writing Rope.cu


In [8]:
%%writefile /content/Rope.cu
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h> // Include stdio.h for printf

__device__ void apply_rotary_embedding(
    float* q,           // query vectors
    float* k,           // key vectors
    const int head_dim, // dimension of each head
    const int position, // absolute position in sequence
    const float base = 10000.0f
) {
    // Process pairs of elements (real, imaginary)
    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(base, (float)(i) / head_dim);
        float theta = position * freq;

        // Calculate rotation matrix elements
        float cos_theta = cosf(theta);
        float sin_theta = sinf(theta);

        // Cache original values
        float q_real = q[i];
        float q_img = q[i + 1];
        float k_real = k[i];
        float k_img = k[i + 1];

        // Apply rotation to query
        q[i] = q_real * cos_theta - q_img * sin_theta;
        q[i + 1] = q_real * sin_theta + q_img * cos_theta;

        // Apply rotation to key
        k[i] = k_real * sin_theta + k_img * cos_theta;
        k[i + 1] = k_real * cos_theta - k_img * sin_theta;

        // Add printf statements for debugging
        printf("Position: %d, i: %d, q[i]: %f, q[i+1]: %f, k[i]: %f, k[i+1]: %f\n", position, i, q[i], q[i+1], k[i], k[i+1]);
    }
}

__global__ void rope_kernel(
    float* queries,        // [batch_size, seq_len, num_heads, head_dim]
    float* keys,          // [batch_size, seq_len, num_heads, head_dim]
    const int batch_size,
    const int seq_len,
    const int num_heads,
    const int head_dim
) {
    // Calculate global position
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Add printf statements for debugging
    printf("Thread index: %d\n", idx);

    // Calculate batch, sequence position, and head indices
    int batch_idx = idx / (seq_len * num_heads);
    int seq_idx = (idx / num_heads) % seq_len;
    int head_idx = idx % num_heads;

    if (batch_idx >= batch_size) return;

    // Calculate base pointer offsets
    int base_idx = batch_idx * (seq_len * num_heads * head_dim) +
                   seq_idx * (num_heads * head_dim) +
                   head_idx * head_dim;

    // Apply rotary embedding to this position
    apply_rotary_embedding(
        &queries[base_idx],
        &keys[base_idx],
        head_dim,
        seq_idx
    );
}

void apply_rope(
    float* d_queries,
    float* d_keys,
    const int batch_size,
    const int seq_len,
    const int num_heads,
    const int head_dim
) {
    dim3 block_size(256);
    dim3 grid_size((batch_size * seq_len * num_heads + block_size.x - 1) / block_size.x);

    rope_kernel<<<grid_size, block_size>>>(
        d_queries,
        d_keys,
        batch_size,
        seq_len,
        num_heads,
        head_dim
    );
}
int main() {
    printf("Starting RoPE kernel test...\n");

    // Define dimensions
    const int batch_size = 1;
    const int seq_len = 4;
    const int num_heads = 2;
    const int head_dim = 8;

    size_t data_size = batch_size * seq_len * num_heads * head_dim * sizeof(float);
    float* h_queries = (float*)malloc(data_size);
    float* h_keys = (float*)malloc(data_size);

    for (int i = 0; i < batch_size * seq_len * num_heads * head_dim; ++i) {
        h_queries[i] = (float)i; // Example initialization
        h_keys[i] = (float)i * 2.0f; // Example initialization
    }

    float* d_queries;
    float* d_keys;
    cudaMalloc(&d_queries, data_size);
    cudaMalloc(&d_keys, data_size);

    cudaMemcpy(d_queries, h_queries, data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_keys, h_keys, data_size, cudaMemcpyHostToDevice);

    apply_rope(d_queries, d_keys, batch_size, seq_len, num_heads, head_dim);

    cudaDeviceSynchronize();

    float* h_queries_out = (float*)malloc(data_size);
    float* h_keys_out = (float*)malloc(data_size);
    cudaMemcpy(h_queries_out, d_queries, data_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_keys_out, d_keys, data_size, cudaMemcpyDeviceToHost);


    printf("RoPE kernel test finished.\n");
=
    free(h_queries);
    free(h_keys);
    free(h_queries_out);
    free(h_keys_out);
    cudaFree(d_queries);
    cudaFree(d_keys);

    return 0;
}

Overwriting /content/Rope.cu


In [9]:
!nvcc /content/Rope.cu -o /content/Rope -gencode arch=compute_75,code=sm_75 -lcublas

!/content/Rope

[01m[0m[01m/content/Rope.cu(132)[0m: [01;31merror[0m: expected an expression
  =
  ^

1 error detected in the compilation of "/content/Rope.cu".
Starting RoPE kernel test...
Thread index: 192
Thread index: 193
Thread index: 194
Thread index: 195
Thread index: 196
Thread index: 197
Thread index: 198
Thread index: 199
Thread index: 200
Thread index: 201
Thread index: 202
Thread index: 203
Thread index: 204
Thread index: 205
Thread index: 206
Thread index: 207
Thread index: 208
Thread index: 209
Thread index: 210
Thread index: 211
Thread index: 212
Thread index: 213
Thread index: 214
Thread index: 215
Thread index: 216
Thread index: 217
Thread index: 218
Thread index: 219
Thread index: 220
Thread index: 221
Thread index: 222
Thread index: 223
Thread index: 96
Thread index: 97
Thread index: 98
Thread index: 99
Thread index: 100
Thread index: 101
Thread index: 102
Thread index: 103
Thread index: 104
Thread index: 105
Thread index: 106
Thread index: 107
Thread index: 108
Thread index: 