Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ if(SYCL_INTEL_TARGET)
add_subdirectory(08_bmg_gemm_f8)
add_subdirectory(09_bmg_grouped_gemm_f8)
add_subdirectory(10_bmg_grouped_gemm_mixed_dtype)
add_subdirectory(sdpa_bwd)
endif()

if (CUTLASS_ENABLE_SYCL)
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/sdpa_bwd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
set(TEST_GROUPS --groups=128)

cutlass_example_add_executable(
sdpa_backward
sdpa_backward.cpp
cnpy.cpp
)
target_link_options(sdpa_backward PUBLIC "-lz")
set_target_properties(sdpa_backward PROPERTIES CXX_COMPILER_LAUNCHER "IGC_VectorAliasBBThreshold=10000" )
set_target_properties(sdpa_backward PROPERTIES CXX_LINKER_LAUNCHER "IGC_VectorAliasBBThreshold=10000" )
340 changes: 340 additions & 0 deletions examples/sycl/sdpa_bwd/cnpy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
//Copyright (C) 2011 Carl Rogers
//Released under MIT License
//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php

#include"cnpy.h"
#include<complex>
#include<cstdlib>
#include<algorithm>
#include<cstring>
#include<iomanip>
#include<stdint.h>
#include<stdexcept>
#include <regex>

char cnpy::BigEndianTest() {
int x = 1;
return (((char *)&x)[0]) ? '<' : '>';
}

char cnpy::map_type(const std::type_info& t)
{
if(t == typeid(float) ) return 'f';
if(t == typeid(double) ) return 'f';
if(t == typeid(long double) ) return 'f';

if(t == typeid(int) ) return 'i';
if(t == typeid(char) ) return 'i';
if(t == typeid(short) ) return 'i';
if(t == typeid(long) ) return 'i';
if(t == typeid(long long) ) return 'i';

if(t == typeid(unsigned char) ) return 'u';
if(t == typeid(unsigned short) ) return 'u';
if(t == typeid(unsigned long) ) return 'u';
if(t == typeid(unsigned long long) ) return 'u';
if(t == typeid(unsigned int) ) return 'u';

if(t == typeid(bool) ) return 'b';

if(t == typeid(std::complex<float>) ) return 'c';
if(t == typeid(std::complex<double>) ) return 'c';
if(t == typeid(std::complex<long double>) ) return 'c';

else return '?';
}

template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
lhs.insert(lhs.end(),rhs.begin(),rhs.end());
return lhs;
}

template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
//write in little endian
size_t len = strlen(rhs);
lhs.reserve(len);
for(size_t byte = 0; byte < len; byte++) {
lhs.push_back(rhs[byte]);
}
return lhs;
}

void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
//std::string magic_string(buffer,6);
uint8_t major_version = *reinterpret_cast<uint8_t*>(buffer+6);
uint8_t minor_version = *reinterpret_cast<uint8_t*>(buffer+7);
uint16_t header_len = *reinterpret_cast<uint16_t*>(buffer+8);
std::string header(reinterpret_cast<char*>(buffer+9),header_len);

size_t loc1, loc2;

//fortran order
loc1 = header.find("fortran_order")+16;
fortran_order = (header.substr(loc1,4) == "True" ? true : false);

//shape
loc1 = header.find("(");
loc2 = header.find(")");

std::regex num_regex("[0-9][0-9]*");
std::smatch sm;
shape.clear();

std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
while(std::regex_search(str_shape, sm, num_regex)) {
shape.push_back(std::stoi(sm[0].str()));
str_shape = sm.suffix().str();
}

//endian, word size, data type
//byte order code | stands for not applicable.
//not sure when this applies except for byte array
loc1 = header.find("descr")+9;
bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
assert(littleEndian);

//char type = header[loc1+1];
//assert(type == map_type(T));

std::string str_ws = header.substr(loc1+2);
loc2 = str_ws.find("'");
word_size = atoi(str_ws.substr(0,loc2).c_str());
}

void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
char buffer[256];
size_t res = fread(buffer,sizeof(char),11,fp);
if(res != 11)
throw std::runtime_error("parse_npy_header: failed fread");
std::string header = fgets(buffer,256,fp);
assert(header[header.size()-1] == '\n');

size_t loc1, loc2;

//fortran order
loc1 = header.find("fortran_order");
if (loc1 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'");
loc1 += 16;
fortran_order = (header.substr(loc1,4) == "True" ? true : false);

//shape
loc1 = header.find("(");
loc2 = header.find(")");
if (loc1 == std::string::npos || loc2 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'");

std::regex num_regex("[0-9][0-9]*");
std::smatch sm;
shape.clear();

std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
while(std::regex_search(str_shape, sm, num_regex)) {
shape.push_back(std::stoi(sm[0].str()));
str_shape = sm.suffix().str();
}

//endian, word size, data type
//byte order code | stands for not applicable.
//not sure when this applies except for byte array
loc1 = header.find("descr");
if (loc1 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'");
loc1 += 9;
bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
assert(littleEndian);

//char type = header[loc1+1];
//assert(type == map_type(T));

std::string str_ws = header.substr(loc1+2);
loc2 = str_ws.find("'");
word_size = atoi(str_ws.substr(0,loc2).c_str());
}

void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset)
{
std::vector<char> footer(22);
fseek(fp,-22,SEEK_END);
size_t res = fread(&footer[0],sizeof(char),22,fp);
if(res != 22)
throw std::runtime_error("parse_zip_footer: failed fread");

uint16_t disk_no, disk_start, nrecs_on_disk, comment_len;
disk_no = *(uint16_t*) &footer[4];
disk_start = *(uint16_t*) &footer[6];
nrecs_on_disk = *(uint16_t*) &footer[8];
nrecs = *(uint16_t*) &footer[10];
global_header_size = *(uint32_t*) &footer[12];
global_header_offset = *(uint32_t*) &footer[16];
comment_len = *(uint16_t*) &footer[20];

assert(disk_no == 0);
assert(disk_start == 0);
assert(nrecs_on_disk == nrecs);
assert(comment_len == 0);
}

cnpy::NpyArray load_the_npy_file(FILE* fp) {
std::vector<size_t> shape;
size_t word_size;
bool fortran_order;
cnpy::parse_npy_header(fp,word_size,shape,fortran_order);

cnpy::NpyArray arr(shape, word_size, fortran_order);
size_t nread = fread(arr.data<char>(),1,arr.num_bytes(),fp);
if(nread != arr.num_bytes())
throw std::runtime_error("load_the_npy_file: failed fread");
return arr;
}

cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) {

std::vector<unsigned char> buffer_compr(compr_bytes);
std::vector<unsigned char> buffer_uncompr(uncompr_bytes);
size_t nread = fread(&buffer_compr[0],1,compr_bytes,fp);
if(nread != compr_bytes)
throw std::runtime_error("load_the_npy_file: failed fread");

int err;
z_stream d_stream;

d_stream.zalloc = Z_NULL;
d_stream.zfree = Z_NULL;
d_stream.opaque = Z_NULL;
d_stream.avail_in = 0;
d_stream.next_in = Z_NULL;
err = inflateInit2(&d_stream, -MAX_WBITS);

d_stream.avail_in = compr_bytes;
d_stream.next_in = &buffer_compr[0];
d_stream.avail_out = uncompr_bytes;
d_stream.next_out = &buffer_uncompr[0];

err = inflate(&d_stream, Z_FINISH);
err = inflateEnd(&d_stream);

std::vector<size_t> shape;
size_t word_size;
bool fortran_order;
cnpy::parse_npy_header(&buffer_uncompr[0],word_size,shape,fortran_order);

cnpy::NpyArray array(shape, word_size, fortran_order);

size_t offset = uncompr_bytes - array.num_bytes();
memcpy(array.data<unsigned char>(),&buffer_uncompr[0]+offset,array.num_bytes());

return array;
}

cnpy::npz_t cnpy::npz_load(std::string fname) {
FILE* fp = fopen(fname.c_str(),"rb");

if(!fp) {
throw std::runtime_error("npz_load: Error! Unable to open file "+fname+"!");
}

cnpy::npz_t arrays;

while(1) {
std::vector<char> local_header(30);
size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
if(headerres != 30)
throw std::runtime_error("npz_load: failed fread");

//if we've reached the global header, stop reading
if(local_header[2] != 0x03 || local_header[3] != 0x04) break;

//read in the variable name
uint16_t name_len = *(uint16_t*) &local_header[26];
std::string varname(name_len,' ');
size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
if(vname_res != name_len)
throw std::runtime_error("npz_load: failed fread");

//erase the lagging .npy
varname.erase(varname.end()-4,varname.end());

//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
if(extra_field_len > 0) {
std::vector<char> buff(extra_field_len);
size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
if(efield_res != extra_field_len)
throw std::runtime_error("npz_load: failed fread");
}

uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);

if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);}
else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);}
}

fclose(fp);
return arrays;
}

cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {
FILE* fp = fopen(fname.c_str(),"rb");

if(!fp) throw std::runtime_error("npz_load: Unable to open file "+fname);

while(1) {
std::vector<char> local_header(30);
size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
if(header_res != 30)
throw std::runtime_error("npz_load: failed fread");

//if we've reached the global header, stop reading
if(local_header[2] != 0x03 || local_header[3] != 0x04) break;

//read in the variable name
uint16_t name_len = *(uint16_t*) &local_header[26];
std::string vname(name_len,' ');
size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);
if(vname_res != name_len)
throw std::runtime_error("npz_load: failed fread");
vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy

//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field

uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);

if(vname == varname) {
NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes);
fclose(fp);
return array;
}
else {
//skip past the data
uint32_t size = *(uint32_t*) &local_header[22];
fseek(fp,size,SEEK_CUR);
}
}

fclose(fp);

//if we get here, we haven't found the variable in the file
throw std::runtime_error("npz_load: Variable name "+varname+" not found in "+fname);
}

cnpy::NpyArray cnpy::npy_load(std::string fname) {

FILE* fp = fopen(fname.c_str(), "rb");

if(!fp) throw std::runtime_error("npy_load: Unable to open file "+fname);

NpyArray arr = load_the_npy_file(fp);

fclose(fp);
return arr;
}



Loading