// sherpa-onnx/csrc/online-transducer-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODEL_H_
#define SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODEL_H_
#include <memory>
#include <utility>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/hypothesis.h"
#include "sherpa-onnx/csrc/online-transducer-decoder.h"
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
namespace sherpa_onnx {
struct OnlineTransducerDecoderResult;
class OnlineTransducerModel {
public:
virtual ~OnlineTransducerModel() = default;
static std::unique_ptr<OnlineTransducerModel> Create(
const OnlineTransducerModelConfig &config);
#if __ANDROID_API__ >= 9
static std::unique_ptr<OnlineTransducerModel> Create(
AAssetManager *mgr, const OnlineTransducerModelConfig &config);
#endif
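
// Illustrative usage sketch (not part of the original header): constructing a
// model from a config. How the config is populated is an assumption here; see
// online-transducer-model-config.h for the actual fields.
//
//   OnlineTransducerModelConfig config;
//   // ... fill in encoder/decoder/joiner model paths, etc. ...
//   std::unique_ptr<OnlineTransducerModel> model =
//       OnlineTransducerModel::Create(config);
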
/** Stack a list of individual states into a batch.
*
* It is the inverse operation of `UnStackStates`.
*
* @param states states[i] contains the state for the i-th utterance.
* @return Return a single value representing the batched state.
*/
virtual std::vector<Ort::Value> StackStates(
const std::vector<std::vector<Ort::Value>> &states) const = 0;
/** Unstack a batch state into a list of individual states.
*
* It is the inverse operation of `StackStates`.
*
* @param states A batched state.
* @return ans[i] contains the state for the i-th utterance.
*/
virtual std::vector<std::vector<Ort::Value>> UnStackStates(
const std::vector<Ort::Value> &states) const = 0;
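
// Sketch (illustrative, not from the original header): batching the states of
// several independent streams before a single batched encoder call, then
// splitting the result back into per-stream states afterwards.
//
//   std::vector<std::vector<Ort::Value>> per_stream_states;  // one entry per stream
//   std::vector<Ort::Value> batched = model->StackStates(per_stream_states);
//   // ... pass `batched` to RunEncoder(), obtaining `next_states` ...
//   per_stream_states = model->UnStackStates(next_states);
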
/** Get the initial encoder states.
*
* @return Return the initial encoder state.
*/
virtual std::vector<Ort::Value> GetEncoderInitStates() = 0;
/** Run the encoder.
*
* @param features A tensor of shape (N, T, C). It is changed in-place.
* @param states Encoder state of the previous chunk. It is changed in-place.
*
* @return Return a tuple containing:
* - encoder_out, a tensor of shape (N, T', encoder_out_dim)
* - next_states Encoder state for the next chunk.
*/
virtual std::pair<Ort::Value, std::vector<Ort::Value>> RunEncoder(
Ort::Value features,
std::vector<Ort::Value> states) = 0; // NOLINT
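
// Sketch of a chunk-by-chunk decoding loop (illustrative only; MakeFeatureTensor()
// is a hypothetical helper that packs ChunkSize() feature frames into an
// Ort::Value of shape (N, T, C)):
//
//   std::vector<Ort::Value> states = model->GetEncoderInitStates();
//   while (/* more feature frames are available */) {
//     Ort::Value features = MakeFeatureTensor(/* next ChunkSize() frames */);
//     auto [encoder_out, next_states] =
//         model->RunEncoder(std::move(features), std::move(states));
//     states = std::move(next_states);
//     // ... run the decoder and joiner on encoder_out ...
//   }
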
/** Run the decoder network.
*
* Caution: We assume there are no recurrent connections in the decoder and
* the decoder is stateless. See
* https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py
* for an example
*
* @param decoder_input It is usually of shape (N, context_size)
* @return Return a tensor of shape (N, decoder_dim).
*/
virtual Ort::Value RunDecoder(Ort::Value decoder_input) = 0;
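
// Sketch (assumption): feeding the last ContextSize() decoded tokens to the
// stateless decoder. `tokens` is a hypothetical std::vector<int64_t> holding
// the hypothesis so far; BuildDecoderInput() below does this packing for you.
//
//   int32_t context_size = model->ContextSize();
//   std::array<int64_t, 2> shape{1, context_size};
//   Ort::Value decoder_input = Ort::Value::CreateTensor<int64_t>(
//       model->Allocator(), shape.data(), shape.size());
//   std::copy(tokens.end() - context_size, tokens.end(),
//             decoder_input.GetTensorMutableData<int64_t>());
//   Ort::Value decoder_out = model->RunDecoder(std::move(decoder_input));
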
/** Run the joint network.
*
* @param encoder_out Output of the encoder network. A tensor of shape
* (N, joiner_dim).
* @param decoder_out Output of the decoder network. A tensor of shape
* (N, joiner_dim).
* @return Return a tensor of shape (N, vocab_size). In icefall, the last
* layer of the joint network is `nn.Linear`,
* not `nn.LogSoftmax`.
*/
virtual Ort::Value RunJoiner(Ort::Value encoder_out,
Ort::Value decoder_out) = 0;
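
// Sketch of one greedy-search step over the joiner output (illustrative;
// treating token id 0 as blank is an assumption that follows the usual
// icefall convention):
//
//   Ort::Value logits = model->RunJoiner(std::move(cur_encoder_out),
//                                        std::move(decoder_out));
//   const float *p = logits.GetTensorData<float>();
//   auto y = static_cast<int32_t>(
//       std::distance(p, std::max_element(p, p + model->VocabSize())));
//   if (y != 0) {
//     // non-blank: append `y` to the hypothesis and recompute decoder_out
//   }
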
/** If we are using a stateless decoder that contains a Conv1d,
* this function returns the kernel size of that convolution layer.
*/
virtual int32_t ContextSize() const = 0;
/** We send this number of feature frames to the encoder at a time. */
virtual int32_t ChunkSize() const = 0;
/** Number of input frames to discard after each call to RunEncoder.
*
* For instance, if we have 30 frames, chunk_size=8, chunk_shift=6.
*
* In the first call of RunEncoder, we use frames 0~7 since chunk_size is 8.
* Then we discard frames 0~5 since chunk_shift is 6.
* In the second call of RunEncoder, we use frames 6~13; and then we discard
* frames 6~11.
* In the third call of RunEncoder, we use frames 12~19; and then we discard
* frames 12~17.
*
* Note: ChunkSize() - ChunkShift() == right context size
*/
virtual int32_t ChunkShift() const = 0;
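
// Sketch of the frame bookkeeping described above (illustrative only):
//
//   int32_t chunk_size = model->ChunkSize();    // e.g., 8
//   int32_t chunk_shift = model->ChunkShift();  // e.g., 6
//   int32_t start = 0;
//   while (start + chunk_size <= num_frames) {
//     // feed frames [start, start + chunk_size) to RunEncoder()
//     start += chunk_shift;  // the last chunk_size - chunk_shift frames are reused
//   }
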
virtual int32_t VocabSize() const = 0;
virtual int32_t SubsamplingFactor() const { return 4; }
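
// For example, with the default SubsamplingFactor() of 4, a chunk of T input
// feature frames yields roughly T / 4 encoder output frames (the T' in
// RunEncoder's documentation).
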
virtual OrtAllocator *Allocator() = 0;
Ort::Value BuildDecoderInput(
const std::vector<OnlineTransducerDecoderResult> &results);
Ort::Value BuildDecoderInput(const std::vector<Hypothesis> &hyps);
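
// Sketch (assumption about intent): both overloads pack the trailing
// ContextSize() tokens of each active result/hypothesis into a single
// (N, context_size) tensor that can be passed directly to RunDecoder():
//
//   Ort::Value decoder_input = model->BuildDecoderInput(results);
//   Ort::Value decoder_out = model->RunDecoder(std::move(decoder_input));
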
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODEL_H_