Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Model/Layer] New forward to support CB (CommonDecoder->DecoderBlock->DecoderLayer->Attention/MLP) #375

Merged
merged 40 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
8f351ba
[Common] Add sequenceMeta, sequenceGroup and sequenecePool. (#343)
changqi1 Apr 25, 2024
5ab63df
Merge commit 'd2b8df0c85ba57b62169d74c88192de3bf6e4820' into cb_dev
pujiang2018 Apr 26, 2024
dbcb267
merge batchSize and seqLen into one in TokenEembedding
pujiang2018 Apr 26, 2024
3949abd
merge batchSize and seqLen into one in TokenEembedding (#350)
pujiang2018 Apr 26, 2024
9a53fb2
[Common] Move Martix into xft namespace. (#351)
Duyi-Wang Apr 26, 2024
25ee312
Merge commit '9a53fb2ea6b9141ba7c045bc0d135c1809e8f22c' into pujiang/…
pujiang2018 Apr 26, 2024
376b2bc
remove unsed function in DecoderLayer
pujiang2018 Apr 26, 2024
7112b33
[Layer] Remove unused functions in Decoder layer (#353)
pujiang2018 Apr 26, 2024
4ff4707
Merge commit '819ecccfa06662bddb16e9e1cd1ec8775d1c0180' into cb_dev
pujiang2018 Apr 29, 2024
d281a54
Merge commit '4ff47074fc85a27e13251c3fb618f36e338c456f' into pujiang/…
pujiang2018 Apr 29, 2024
b5b225a
fix compile error of embeddingForward
pujiang2018 Apr 29, 2024
f8f8571
[Model] Fix compile error of embeddingForward in YaRNLlama (#358)
pujiang2018 Apr 29, 2024
b95dac1
[Common] Add sampling params into group seq. (#356)
Duyi-Wang Apr 29, 2024
d5c9407
remove DecoderContext in computeSoftmax
pujiang2018 Apr 29, 2024
be615b2
Merge commit 'f8f85714331c0df2ce4a8344e06972316770ec11' into pujiang/…
pujiang2018 Apr 29, 2024
e48ea1f
[Util] Remove DecoderContext in computeSoftmax (#362)
pujiang2018 Apr 29, 2024
7ad311e
[Common] Refactor sequence.h. (#363)
Duyi-Wang Apr 30, 2024
9e7bdca
[kernels] refactor flash attention for continuous batching (#361)
abenmao Apr 30, 2024
cfab63a
[models] Add attnMeta for continuous batching (#364)
abenmao Apr 30, 2024
deabd33
[Layers] fix build error (#365)
abenmao Apr 30, 2024
2499f60
[Model] add interface for seq meta. (#366)
Duyi-Wang Apr 30, 2024
5833d41
Merge commit '2499f602c22184ca5afaa2f013ae0ff4e3bd4263' into pujiang/…
pujiang2018 May 3, 2024
0514833
refactor resize function in DecoderContext to support CB, and qkScore…
pujiang2018 May 6, 2024
c792aff
[Common] Modify resize() in DecoderContext to support (#367)
pujiang2018 May 6, 2024
5315a5a
Merge commit 'c792aff5f7cc8554afa0399f4b6d241333b0b56c' into pujiang/…
pujiang2018 May 6, 2024
63e895a
add some code to CommonDecoder::forward()
pujiang2018 May 6, 2024
9cfc7c6
SequenceMeta refactor
pujiang2018 May 7, 2024
2e05716
[Model] New CommonDecoder::forward impl. skeleton (#369)
pujiang2018 May 7, 2024
41e692c
new KVCacheMgr supporting CB
pujiang2018 May 7, 2024
32c845c
Merge commit '2e057165b456ef9b88591a880403ffe47e7500c3' into pujiang/…
pujiang2018 May 7, 2024
c2ac8d2
fix typo & set default prefixId to -1 in addSequence()
pujiang2018 May 7, 2024
dfa1d0e
[Common] New KVCacheMgr to support CB (#371)
pujiang2018 May 7, 2024
e79c9c2
[Sampling] Add repetition penalty for new seq type. (#373)
Duyi-Wang May 8, 2024
1e93ef0
Merge commit 'e79c9c21825ad51116f49a290baa7eea80e25b07' into pujiang/…
pujiang2018 May 8, 2024
689b41a
New forward to support CB (CommonDecoder->DecoderBlock->DecoderLayer->…
pujiang2018 May 8, 2024
3ce8d06
add todo
pujiang2018 May 8, 2024
9377f83
[Sampling] Add greedy search for cb path. (#376)
Duyi-Wang May 8, 2024
b71ec05
Merge commit '9377f8371ebfe4999c2171eceab106a52ff2618a' into pujiang/…
pujiang2018 May 8, 2024
3144675
logic issue fix
pujiang2018 May 8, 2024
fcee295
Merge branch 'cb_dev' into pujiang/feature/cb_dev
pujiang2018 May 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/abstract_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#pragma once
#include <cstdint>
#include <tuple>
#include "sequence.h"

class DecoderContext;
class Messenger;
Expand All @@ -35,6 +36,8 @@ class AbstractDecoder {
// |<----------------------- vocabSize ----------------------------->|
virtual std::tuple<float *, int, int> forward(int *ids, int64_t *dims, int step, bool logits_all = false) = 0;

virtual std::tuple<float *, int, int> forward(std::vector<xft::SequenceMeta *> &seq, bool logits_all = false) = 0;

// Reorder cached keys and values, size=batchSize*beamSize
virtual void reorderCache(int *idx, int size) = 0;

Expand Down
11 changes: 10 additions & 1 deletion include/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,17 @@ class Model {

void config(SearcherConfig &config_, const std::vector<std::vector<int>> &stopWordsList_ = {});

void set_input(std::vector<int32_t> &inputIds_, int batchSize_, int maxLen_ = -1, int numBeams_ = 1,
int numBeamHypsToKeep_ = 1, float lenPenalty_ = 1.0, bool doEarlyStopping_ = false, int eosTokenId_ = -1,
int padTokenId_ = -1, bool doSample_ = false, float temperature_ = 1.0, int topK_ = 50, float topP_ = 1.0,
float repetitionPenalty_ = 1.0, const std::vector<std::vector<int>> &stopWordsList_ = {});

void set_input(std::vector<int32_t> &inputIds_, int batchSize_, SearcherConfig &config_,
const std::vector<std::vector<int>> &stopWordsList_ = {});

bool isDone();

std::tuple<float *, int, int> forward();
std::tuple<float *, int, int> forward(bool logits_all = true);

std::vector<int32_t> generate();

Expand Down Expand Up @@ -79,6 +87,7 @@ class Model {
int vocabSize;
SearcherConfig configuration;
bool isNewInput;
std::vector<SequenceGroupMeta *> workingGroup;
};

class AutoModel : public Model {
Expand Down
70 changes: 70 additions & 0 deletions src/common/attn_metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright (c) 2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <cassert>
#include <vector>
// Per-forward metadata describing a (possibly continuous) batch for attention:
// how many new tokens and how many cached (past) tokens each sequence has,
// plus the start offset of each sequence in the flattened token dimension.
class AttnMetaData {

public:
    AttnMetaData() : isPrompt(false), isCausal(true), batchSize(0), attnMask(nullptr) {}

    // inputTokenSizes[i]: number of new tokens for sequence i
    // pastTokenSizes[i]:  number of already-cached tokens for sequence i
    // isCausal=true means causal attention (no explicit mask needed);
    // isCausal=false requires a caller-provided attnMask.
    AttnMetaData(int batchSize, int *inputTokenSizes, int *pastTokenSizes, bool isPrompt, bool isCausal,
            float *attnMask = nullptr)
        : isPrompt(isPrompt), isCausal(isCausal), batchSize(batchSize), attnMask(attnMask) {
        // causal=True -> no mask expected; causal=False -> mask required
        assert(!isCausal || attnMask == nullptr);
        assert(isCausal || attnMask != nullptr);

        // fill inputSeqLens, pastSeqLens, seqStartLoc
        inputSeqLens.resize(batchSize);
        pastSeqLens.resize(batchSize);
        seqStartLoc.resize(batchSize + 1);

        // seqStartLoc is the exclusive prefix sum of inputSeqLens
        seqStartLoc[0] = 0;
        for (int i = 0; i < batchSize; i++) {
            inputSeqLens[i] = inputTokenSizes[i];
            pastSeqLens[i] = pastTokenSizes[i];
            seqStartLoc[i + 1] = seqStartLoc[i] + inputSeqLens[i];
        }
    }

    // Vector overload: inputTokens/pastTokens are copied into the members.
    AttnMetaData(std::vector<int> &inputTokens, std::vector<int> &pastTokens, bool isPrompt, bool isCausal,
            float *attnMask = nullptr)
        : isPrompt(isPrompt), isCausal(isCausal), batchSize(static_cast<int>(inputTokens.size())),
          inputSeqLens(inputTokens), pastSeqLens(pastTokens), attnMask(attnMask) {
        // causal=True -> no mask expected; causal=False -> mask required
        assert(!isCausal || attnMask == nullptr);
        assert(isCausal || attnMask != nullptr);

        // fill seqStartLoc (exclusive prefix sum of inputSeqLens)
        seqStartLoc.resize(batchSize + 1);

        seqStartLoc[0] = 0;
        for (int i = 0; i < batchSize; i++) {
            seqStartLoc[i + 1] = seqStartLoc[i] + inputSeqLens[i];
        }
    }

private:
    bool isPrompt; // true when processing the prompt (first token) phase
    bool isCausal; // true -> causal attention, no explicit mask

    int batchSize;
    std::vector<int> inputSeqLens; // new tokens per sequence
    std::vector<int> pastSeqLens; // cached tokens per sequence
    std::vector<int> seqStartLoc; // size batchSize+1; start offset of each seq in flattened input

    float *attnMask; // non-owning; required iff !isCausal

};
216 changes: 216 additions & 0 deletions src/common/kvcache_mgr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
// Copyright (c) 2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <vector>
#include "kvcache_tensor.h"
#include <unordered_map>

namespace xft {

// Type-erased interface over the templated KV-cache manager implementation,
// so the non-template facade (KVCacheMgr) can hold one pointer regardless of
// the cache element type selected at configure() time.
class KVCacheMgrImplBase {
public:
    virtual ~KVCacheMgrImplBase() = default;
    // Release the cache owned by sequence seqID; returns false if unknown.
    virtual bool delSequence(int seqID) = 0;
    // Create (or reuse) a cache for a new sequence; returns false if seqID already exists.
    // prefixId: optional prefix cache to start from (-1 = none).
    virtual bool addSequence(int seqID, int prefixId = -1) = 0;
    // Reorder caches from prevSeqIDs to seqIDs (beam search expansion).
    virtual bool reorderCache(const std::vector<int> &seqIDs, const std::vector<int> &prevSeqIDs) = 0;
    // Snapshot sequence seqID's cache as a shared prefix under prefixId.
    virtual bool addPrefix(int prefixId, int seqID) = 0;
    // Arrange caches in the order given by seqIDs for the upcoming forward pass.
    virtual bool prepareCache(const std::vector<int> &seqIDs) = 0;
    // Per-layer key/value caches of the prepared sequences (one void* per sequence).
    virtual std::vector<void *> getKey(int layerId) = 0;
    virtual std::vector<void *> getValue(int layerId) = 0;
};

// KV-cache manager for element type T. Each sequence owns an array of
// 2 * layers KVCacheTensor objects laid out as K0, V0, K1, V1, ...
template <typename T>
class KVCacheMgrImpl : public KVCacheMgrImplBase {
public:
    // layers: number of decoder layers managed per sequence
    KVCacheMgrImpl(int layers) { this->layers = layers; }

    ~KVCacheMgrImpl() {
        // Free resource in cachePool (readyCaches are in cachePool too).
        // Cache arrays come from new[], so delete[] is required — scalar
        // delete on a new[] array is undefined behavior.
        for (auto &it : sequenceCaches) {
            delete[] it.second;
        }
        // Free resource in prefixCaches
        for (auto &it : prefixCaches) {
            delete[] it.second;
        }
        // Free resource in freeCaches
        for (auto &it : freeCaches) {
            delete[] it;
        }
    }

    // Free KVCache by sample ID; the cache array is pooled for reuse.
    bool delSequence(int seqID) override {
        auto it = sequenceCaches.find(seqID);

        // Fail if not exist
        if (it == sequenceCaches.end()) { return false; }

        // Move from sequenceCaches to freeCaches
        freeCaches.push_back(it->second);

        sequenceCaches.erase(it);

        return true;
    }

    // Register a new sequence, reusing a pooled cache array when available.
    // NOTE(review): prefixId is accepted but not yet applied here — prefix
    // contents are not copied into the new cache. TODO: honor prefixId.
    bool addSequence(int seqID, int prefixId = -1) override {
        // Fail if already exist
        if (sequenceCaches.find(seqID) != sequenceCaches.end()) { return false; }

        // Get a free cache or create a new one
        KVCacheTensor<T> *cache = nullptr;
        if (!freeCaches.empty()) {
            cache = freeCaches.back();
            freeCaches.pop_back();
        } else {
            cache = new KVCacheTensor<T>[2 * layers];
        }

        sequenceCaches.insert({seqID, cache});

        return true;
    }

    // Reorder cache based on prevSeqIDs for beam search (caches reordered from prevSeqIDs to seqIDs)
    // For example, if seqIDs = {1, 2, 3, 4} and prevSeqIDs = {1, 1, 1, 1}, then means to expand cache for sample 1
    bool reorderCache(const std::vector<int> &seqIDs, const std::vector<int> &prevSeqIDs) override {
        // TODO: implement reorderCache
        return false;
    }

    // Create KVCache for prefix sharing
    bool addPrefix(int prefixId, int seqID) override {
        // Fail if already exist
        if (prefixCaches.find(prefixId) != prefixCaches.end()) { return false; }

        // Cannot find the sample cache
        if (sequenceCaches.find(seqID) == sequenceCaches.end()) { return false; }

        // Create a new one
        KVCacheTensor<T> *cache = new KVCacheTensor<T>[2 * layers];

        for (int i = 0; i < 2 * layers; i++) {
            // TODO: add from method in KVCacheTensor
            //cache[i].from(sequenceCaches[seqID][i]);
        }

        prefixCaches.insert({prefixId, cache});

        return true;
    }

    // Set cache to be ready for this order of sampleIds; fails (and leaves
    // readyCaches untouched) if any seqID is unknown.
    bool prepareCache(const std::vector<int> &seqIDs) override {
        std::vector<KVCacheTensor<T> *> readyList;
        readyList.reserve(seqIDs.size());

        for (auto seqID : seqIDs) {
            auto it = sequenceCaches.find(seqID);
            if (it == sequenceCaches.end()) { return false; }
            readyList.push_back(it->second);
        }

        readyCaches = std::move(readyList);

        return true;
    }

    // Get key caches for a layer (one pointer per prepared sequence)
    std::vector<void *> getKey(int layerId) override {
        std::vector<void *> keyCaches;
        keyCaches.reserve(readyCaches.size());
        for (auto cache : readyCaches) {
            keyCaches.push_back(&cache[2 * layerId]);
        }
        return keyCaches;
    }

    // Get value caches for a layer (one pointer per prepared sequence)
    std::vector<void *> getValue(int layerId) override {
        std::vector<void *> valueCaches;
        valueCaches.reserve(readyCaches.size());
        for (auto cache : readyCaches) {
            valueCaches.push_back(&cache[2 * layerId + 1]);
        }
        return valueCaches;
    }

private:
    // seqID -> pointer to an array of caches (each element is a KVCacheTensor, size=2*layers)
    // Layout of each array is:
    //     <Key cache for layer 0>
    //     <Value cache for layer 0>
    //     <Key cache for layer 1>
    //     <Value cache for layer 1>
    //     ...
    std::unordered_map<int, KVCacheTensor<T> *> sequenceCaches;

    // prefixID -> pointer to an array of caches (each element is a KVCacheTensor, size=2*layers)
    std::unordered_map<int, KVCacheTensor<T> *> prefixCaches;

    // List of ready caches, each element is for a sample; subset of sequenceCaches
    std::vector<KVCacheTensor<T> *> readyCaches;

    // List of pending free caches, each element is for a sample
    std::vector<KVCacheTensor<T> *> freeCaches;

    int layers;
};

// Process-wide facade over KVCacheMgrImpl (Meyers singleton). configure()
// must be called before any other member; calls forward to the impl.
class KVCacheMgr {
public:
    static KVCacheMgr &instance() {
        static KVCacheMgr inst;
        return inst;
    }

    // Select the cache element type and layer count. Safe to call again:
    // the previous implementation is released before being replaced.
    void configure(int layers, DataType dataType) {
        delete cacheMgrImpl; // avoid leaking a previously configured impl
        switch (dataType) {
            case DataType::int8: cacheMgrImpl = new KVCacheMgrImpl<int8_t>(layers); break;
            case DataType::fp16: cacheMgrImpl = new KVCacheMgrImpl<float16_t>(layers); break;
            // NOTE(review): unrecognized types silently fall back to fp16 — confirm intended
            default: cacheMgrImpl = new KVCacheMgrImpl<float16_t>(layers); break;
        }
    }

    bool delSequence(int seqID) { return cacheMgrImpl->delSequence(seqID); }

    bool addSequence(int seqID, int prefixId = -1) { return cacheMgrImpl->addSequence(seqID, prefixId); }

    bool reorderCache(const std::vector<int> &seqIDs, const std::vector<int> &prevSeqIDs) {
        return cacheMgrImpl->reorderCache(seqIDs, prevSeqIDs);
    }

    bool addPrefix(int prefixId, int seqID) { return cacheMgrImpl->addPrefix(prefixId, seqID); }

    bool prepareCache(const std::vector<int> &seqIDs) { return cacheMgrImpl->prepareCache(seqIDs); }

    std::vector<void *> getKey(int layerId) { return cacheMgrImpl->getKey(layerId); }

    std::vector<void *> getValue(int layerId) { return cacheMgrImpl->getValue(layerId); }

private:
    KVCacheMgrImplBase *cacheMgrImpl; // owned; null until configure() is called

    KVCacheMgr() : cacheMgrImpl(nullptr) {}

    ~KVCacheMgr() { delete cacheMgrImpl; }

    KVCacheMgr(const KVCacheMgr &) = delete;
    KVCacheMgr &operator=(const KVCacheMgr &) = delete;
};

} // namespace xft
4 changes: 2 additions & 2 deletions src/common/my_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ void *xft_numa_alloc(size_t size);
void xft_numa_free(void *start, size_t size);
}

namespace hpj {
namespace xft {

template <typename T>
struct is_quantization_type {
Expand Down Expand Up @@ -366,4 +366,4 @@ class Vector {
}
uint64_t Size() { return size; }
};
} // namespace hpj
} // namespace xft
Loading