In [1]:
!pip install pycuda

# Una lista como parámetro

In [None]:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import hashlib
import time


# CUDA C source for the two kernels used by sha256_cuda().
_SHA256_KERNEL_SOURCE = """
    #include <stdint.h>
    #include <string.h>
    #include <cuda_runtime.h>

    // SHA-256 round constants.
    __constant__ uint32_t K[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };

    __device__ __forceinline__ uint32_t rotr(uint32_t x, uint32_t n) {
        return (x >> n) | (x << (32 - n));
    }

    __device__ __forceinline__ uint32_t ch(uint32_t x, uint32_t y, uint32_t z) {
        return (x & y) ^ (~x & z);
    }

    __device__ __forceinline__ uint32_t maj(uint32_t x, uint32_t y, uint32_t z) {
        return (x & y) ^ (x & z) ^ (y & z);
    }

    __device__ __forceinline__ uint32_t Sigma0(uint32_t x) {
        return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
    }

    __device__ __forceinline__ uint32_t Sigma1(uint32_t x) {
        return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
    }

    __device__ __forceinline__ uint32_t sigma0(uint32_t x) {
        return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
    }

    __device__ __forceinline__ uint32_t sigma1(uint32_t x) {
        return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
    }

    // Reverse the byte order of a 32-bit word.
    __device__ __forceinline__ uint32_t swap_endian32(uint32_t x) {
        return ((x >> 24) & 0xff) |
               ((x << 8) & 0xff0000) |
               ((x >> 8) & 0xff00) |
               ((x << 24) & 0xff000000);
    }

    // Byte `pos` of the SHA-256 padded message, computed on the fly so that no
    // per-thread padded buffer is needed: message bytes, then 0x80, then zeros,
    // then the 64-bit big-endian bit length in the last 8 bytes.
    __device__ __forceinline__ uint32_t padded_byte(const unsigned char* data, uint64_t len,
                                                    uint64_t padded_len, uint64_t pos) {
        if (pos < len) {
            return data[pos];
        }
        if (pos == len) {
            return 0x80u;
        }
        if (pos + 8 >= padded_len) {
            return (uint32_t)(((len * 8) >> (8 * (padded_len - 1 - pos))) & 0xff);
        }
        return 0u;
    }

    // One thread per message. Message idx occupies data_lengths[idx] bytes starting
    // at data_list + offsets[idx]; its 32-byte digest is written to hash_list + idx * 8.
    __global__ void sha256_kernel(const unsigned char* data_list, uint32_t* hash_list,
                                  const uint64_t* data_lengths, const uint64_t* offsets,
                                  uint64_t num_elements) {
        // Widen before multiplying so the flat index cannot wrap in 32 bits.
        uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;

        if (idx >= num_elements) {
            return;
        }

        const unsigned char* data = data_list + offsets[idx];
        const uint64_t data_length = data_lengths[idx];
        // Message + 0x80 marker + 8 length bytes, rounded up to a multiple of 64.
        const uint64_t padded_length = ((data_length + 8) / 64 + 1) * 64;

        uint32_t state[8] = {
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
        };

        for (uint64_t base = 0; base < padded_length; base += 64) {
            uint32_t w[64];

            // Assemble big-endian words byte by byte (no alignment assumptions).
            for (int t = 0; t < 16; t++) {
                const uint64_t p = base + 4 * (uint64_t)t;
                w[t] = (padded_byte(data, data_length, padded_length, p) << 24) |
                       (padded_byte(data, data_length, padded_length, p + 1) << 16) |
                       (padded_byte(data, data_length, padded_length, p + 2) << 8) |
                        padded_byte(data, data_length, padded_length, p + 3);
            }
            for (int t = 16; t < 64; t++) {
                w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
            }

            uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
            uint32_t e = state[4], f = state[5], g = state[6], h = state[7];

            for (int t = 0; t < 64; t++) {
                uint32_t temp1 = h + Sigma1(e) + ch(e, f, g) + K[t] + w[t];
                uint32_t temp2 = Sigma0(a) + maj(a, b, c);

                h = g;
                g = f;
                f = e;
                e = d + temp1;
                d = c;
                c = b;
                b = a;
                a = temp1 + temp2;
            }

            // Add the compressed chunk to the running hash value.
            state[0] += a;
            state[1] += b;
            state[2] += c;
            state[3] += d;
            state[4] += e;
            state[5] += f;
            state[6] += g;
            state[7] += h;
        }

        // Byte-swap each word before storing so the digest bytes are laid out
        // most-significant first.
        uint32_t* output = hash_list + idx * 8;
        for (int i = 0; i < 8; i++) {
            output[i] = swap_endian32(state[i]);
        }
    }

    // One thread per digest. A digest passes when its last `num_zeros` hexadecimal
    // digits are all zero. Passing digests are appended to output_hashes until
    // `max_hashes` have been stored; output_count keeps counting every match.
    __global__ void filter_hashes(const unsigned char* hash_list, unsigned char* output_hashes,
                                  int* output_count, uint64_t num_elements, int num_zeros,
                                  int max_hashes) {
        uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;

        if (idx >= num_elements) {
            return;
        }

        const unsigned char* hash = hash_list + idx * 32;  // 256 bits = 32 bytes

        bool isValid = true;

        if (num_zeros > 0) {
            int zero_bytes = num_zeros / 2;   // two hex digits per byte
            int extra_zeros = num_zeros % 2;  // one more half byte when num_zeros is odd

            // First check the whole bytes, walking backwards from the last byte.
            for (int i = 0; i < zero_bytes; i++) {
                if (hash[31 - i] != 0) {
                    isValid = false;
                    break;
                }
            }

            // Then, for an odd digit count, check the low nibble of the next byte.
            if (isValid && extra_zeros > 0) {
                if ((hash[31 - zero_bytes] & 0xF) != 0) {
                    isValid = false;
                }
            }
        }

        // atomicAdd hands every matching thread a unique slot number; only slots
        // below max_hashes are written, so the output buffer cannot overflow.
        if (isValid) {
            int old_count = atomicAdd(output_count, 1);
            if (old_count < max_hashes) {
                uint64_t local_offset = (uint64_t)old_count * 32;  // 32 bytes per hash
                for (int i = 0; i < 32; i++) {
                    output_hashes[local_offset + i] = hash[i];
                }
            }
        }
    }

    """

# Compiled module, created on first use by _get_sha256_module().
_sha256_module_cache = None


def _get_sha256_module():
    """Compile the CUDA source on first use and return the cached module afterwards."""
    global _sha256_module_cache
    if _sha256_module_cache is None:
        _sha256_module_cache = SourceModule(_SHA256_KERNEL_SOURCE)
    return _sha256_module_cache


def sha256_cuda(data_list, num_zeros=1, max_hashes=10):
    """Hash every message on the GPU and return the digests that end in zeros.

    Each element of ``data_list`` is hashed with SHA-256 by one GPU thread. A
    second kernel keeps the digests whose hexadecimal form ends in at least
    ``num_zeros`` ``'0'`` characters, storing at most ``max_hashes`` of them.

    Args:
        data_list: Iterable of ``bytes`` objects to hash. Messages may have any
            length, including zero.
        num_zeros: Number of trailing hexadecimal zero digits required
            (at most 64). Values <= 0 accept every digest.
        max_hashes: Maximum number of matching digests to return (>= 1).

    Returns:
        List of lowercase hexadecimal digest strings, at most ``max_hashes``
        long. When more digests match than fit, which ones are kept (and their
        order) depends on the order in which GPU threads claim output slots.

    Raises:
        ValueError: If ``num_zeros`` exceeds 64 or ``max_hashes`` is below 1.
    """
    data_list = list(data_list)
    num_zeros = int(num_zeros)
    max_hashes = int(max_hashes)

    # Validate before touching the GPU: a digest has only 64 hex digits, and a
    # larger request would make the filter kernel read outside the digest.
    if num_zeros > 64:
        raise ValueError("num_zeros must not exceed 64 (a SHA-256 digest has 64 hex digits)")
    if max_hashes < 1:
        raise ValueError("max_hashes must be at least 1")

    num_elements = len(data_list)
    if num_elements == 0:
        # Nothing to hash: avoid zero-byte allocations and an empty launch grid.
        num_valid_hashes = 0
        print(f"{num_valid_hashes} hashes cumplen con la condición.")
        return []

    # Pack all messages into one contiguous buffer with per-message length/offset.
    concatenated_data = b''.join(data_list)
    data_lengths = np.fromiter((len(data) for data in data_list), dtype=np.uint64, count=num_elements)
    offsets = np.zeros(num_elements, dtype=np.uint64)
    offsets[1:] = np.cumsum(data_lengths[:-1], dtype=np.uint64)

    mod = _get_sha256_module()
    sha256_kernel = mod.get_function("sha256_kernel")
    filter_hashes = mod.get_function("filter_hashes")

    BLOCK_SIZE = 256
    block = (BLOCK_SIZE, 1, 1)
    grid = ((num_elements + BLOCK_SIZE - 1) // BLOCK_SIZE, 1, 1)  # ceil-div

    allocations = []

    def alloc(nbytes):
        # Always request at least one byte so an all-empty input still allocates.
        buffer = cuda.mem_alloc(max(1, int(nbytes)))
        allocations.append(buffer)
        return buffer

    try:
        data_gpu = alloc(len(concatenated_data))
        data_lengths_gpu = alloc(data_lengths.nbytes)
        offsets_gpu = alloc(offsets.nbytes)
        result_gpu = alloc(32 * num_elements)  # 32 bytes per digest
        filtered_hashes_gpu = alloc(32 * max_hashes)
        output_count_gpu = alloc(np.int32().nbytes)  # a single int counter

        # Host -> device copies.
        if len(concatenated_data) > 0:
            cuda.memcpy_htod(data_gpu, concatenated_data)
        cuda.memcpy_htod(data_lengths_gpu, data_lengths)
        cuda.memcpy_htod(offsets_gpu, offsets)
        cuda.memcpy_htod(output_count_gpu, np.zeros(1, dtype=np.int32))  # counter starts at 0

        sha256_kernel(data_gpu, result_gpu, data_lengths_gpu, offsets_gpu,
                      np.uint64(num_elements), block=block, grid=grid)
        filter_hashes(result_gpu, filtered_hashes_gpu, output_count_gpu,
                      np.uint64(num_elements), np.int32(num_zeros), np.int32(max_hashes),
                      block=block, grid=grid)
        cuda.Context.synchronize()  # wait for both kernels and surface any kernel error here

        # The counter records every match; only the first max_hashes were stored.
        output_count = np.empty(1, dtype=np.int32)
        cuda.memcpy_dtoh(output_count, output_count_gpu)
        num_valid_hashes = min(int(output_count[0]), max_hashes)
        filtered_hashes = np.empty((num_valid_hashes, 32), dtype=np.uint8)

        if num_valid_hashes > 0:  # only copy when there is something to copy
            cuda.memcpy_dtoh(filtered_hashes, filtered_hashes_gpu)
    finally:
        # Release device memory even if a launch or copy failed.
        for buffer in allocations:
            buffer.free()

    print(f"{num_valid_hashes} hashes cumplen con la condición.")
    return [hash_row.tobytes().hex() for hash_row in filtered_hashes]


# Función principal para probar la función de hash CUDA
def main():
    """Time sha256_cuda on one million random 8-byte messages and print the results.

    Builds the random inputs with NumPy, measures the wall-clock time of a single
    sha256_cuda call that asks for 3 zero digits, prints up to ten of the returned
    hashes and finally prints the elapsed time.
    """
    num_data = 1000000
    random_rows = np.random.randint(0, 256, size=(num_data, 8), dtype=np.uint8)
    data_list = []
    for row in random_rows:
        data_list.append(row.tobytes())

    started = time.time()
    cuda_hashes_hex = sha256_cuda(data_list, 3)
    elapsed = time.time() - started

    # To cross-check against the CPU, compare each printed hash with
    # hashlib.sha256(data).hexdigest() computed over data_list.

    # Show at most the first ten hashes to keep the output short.
    shown = cuda_hashes_hex[:10]
    for position in range(len(shown)):
        print(shown[position])

    print(f"El programa tomó {elapsed} segundos en ejecutarse para {num_data} elementos.")

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
