Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 35 additions & 19 deletions clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,11 +476,12 @@ struct CLIPLayer : public GGMLBlock {
public:
CLIPLayer(int64_t d_model,
int64_t n_head,
int64_t intermediate_size)
int64_t intermediate_size,
bool proj_in = false)
: d_model(d_model),
n_head(n_head),
intermediate_size(intermediate_size) {
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));

blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
Expand Down Expand Up @@ -509,11 +510,12 @@ struct CLIPEncoder : public GGMLBlock {
CLIPEncoder(int64_t n_layer,
int64_t d_model,
int64_t n_head,
int64_t intermediate_size)
int64_t intermediate_size,
bool proj_in = false)
: n_layer(n_layer) {
for (int i = 0; i < n_layer; i++) {
std::string name = "layers." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
}
}

Expand Down Expand Up @@ -549,10 +551,10 @@ class CLIPEmbeddings : public GGMLBlock {
int64_t num_positions;
bool force_clip_f32;

void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
if (!support_get_rows(token_wtype)) {
token_wtype = GGML_TYPE_F32;
}
Expand Down Expand Up @@ -605,7 +607,8 @@ class CLIPVisionEmbeddings : public GGMLBlock {
int64_t image_size;
int64_t num_patches;
int64_t num_positions;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {

void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
Expand Down Expand Up @@ -668,7 +671,7 @@ enum CLIPVersion {

class CLIPTextModel : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
Expand All @@ -689,7 +692,8 @@ class CLIPTextModel : public GGMLBlock {

CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
bool force_clip_f32 = false,
bool proj_in = false)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
Expand All @@ -704,7 +708,7 @@ class CLIPTextModel : public GGMLBlock {
}

blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}

Expand Down Expand Up @@ -758,7 +762,7 @@ class CLIPVisionModel : public GGMLBlock {
int32_t n_layer = 24;

public:
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
intermediate_size = 5120;
Expand All @@ -773,7 +777,7 @@ class CLIPVisionModel : public GGMLBlock {

blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}

Expand Down Expand Up @@ -811,8 +815,8 @@ class CLIPProjection : public UnaryBlock {
int64_t out_features;
bool transpose_weight;

void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else {
Expand Down Expand Up @@ -845,15 +849,16 @@ class CLIPVisionModelProjection : public GGMLBlock {

public:
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool transpose_proj_w = false) {
bool transpose_proj_w = false,
bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
projection_dim = 1024;
} else if (version == OPEN_CLIP_VIT_BIGG_14) {
hidden_size = 1664;
}

blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}

Expand Down Expand Up @@ -881,13 +886,24 @@ struct CLIPTextModelRunner : public GGMLRunner {

CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
model.init(params_ctx, tensor_types, prefix);
: GGMLRunner(backend, offload_params_to_cpu) {
bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (contains(name, "self_attn.in_proj")) {
proj_in = true;
break;
}
}
model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
model.init(params_ctx, tensor_storage_map, prefix);
}

std::string get_desc() override {
Expand Down
59 changes: 41 additions & 18 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ class GEGLU : public UnaryBlock {
int64_t dim_in;
int64_t dim_out;

void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
enum ggml_type bias_wtype = GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
Expand Down Expand Up @@ -408,30 +408,40 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;

public:
SpatialTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim)
int64_t context_dim,
bool use_linear)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
context_dim(context_dim),
use_linear(use_linear) {
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
if (use_linear) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
} else {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
}

for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
}

blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
if (use_linear) {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
} else {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
}

virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
Expand All @@ -440,8 +450,8 @@ class SpatialTransformer : public GGMLBlock {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);

auto x_in = x;
int64_t n = x->ne[3];
Expand All @@ -450,10 +460,15 @@ class SpatialTransformer : public GGMLBlock {
int64_t inner_dim = n_head * d_head;

x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]

x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
if (use_linear) {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
} else {
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
}

for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
Expand All @@ -462,11 +477,19 @@ class SpatialTransformer : public GGMLBlock {
x = transformer_block->forward(ctx, x, context);
}

x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
if (use_linear) {
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]

// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
} else {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]

// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
}

x = ggml_add(ctx->ggml_ctx, x, x_in);
return x;
Expand All @@ -475,7 +498,7 @@ class SpatialTransformer : public GGMLBlock {

class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
Expand Down
Loading
Loading