@@ -48,6 +48,7 @@ def bytes_to_unicode():
 
 
 class SentencePieceVocab:
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
@@ -116,8 +117,7 @@ def load_vocab_for_baichuan(path: Path) -> SentencePieceVocab:
     else:
         raise FileNotFoundError(
             f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, \
-            pass the directory as --vocab-dir"
-        )
+            pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -161,9 +161,112 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("f", 10000.0))  # freq_base
     fout.write(struct.pack("f", 1.0))  # rope_factor
 
-    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
-    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
-    fout.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] =="yarn" else 0))
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] =="yarn" else 0))
+
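+    # special token ids from the tokenizer, with fallback defaults (bos=1, eos=2, pad=-1, sep=-1)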
+    fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
+    fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
+    fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
+    fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))
+
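+    # serialize the SentencePiece vocab: one (length, utf-8 bytes, score) record per token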
+    tokenizer_path = Path(tokenizer.vocab_file).parent
+    vocab = load_vocab_for_baichuan(Path(tokenizer_path))
+    counter = 0
+    for text, score in vocab.all_tokens():
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", score))
+        counter += 1
+
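+    # pad the vocab out to hparams["vocab_size"] by repeating the last token with score 0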
+    while counter < hparams["vocab_size"]:
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", 0))
+        counter += 1
+
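+    # write each tensor as: header (n_dims, name length, dtype), dims in reverse order, name bytes, raw data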
+    for name in list_vars.keys():
+        data = list_vars[name].squeeze().numpy()
+        print("Processing variable: " + name + " with shape: ", data.shape)
+        if 'inv_freq' in name:
+            continue
+
+        n_dims = len(data.shape)
+
+        # ftype == 0 -> float32, ftype == 1 -> float16
+        ftype_cur = 0
+        if ftype != 0:
+            if name[-7:] == ".weight" and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+        else:
+            if data.dtype != np.float32:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        # header
+        str = name.encode("utf-8")
+        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
+        for i in range(n_dims):
+            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+        fout.write(str)
+
+        # data
+        data.tofile(fout)
+
+    fout.close()
+
+    print("Done. Output file: " + fname_out)
+    print("")
+
+
+def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
+    print("Baichuan-7B converting: ")
+    list_vars = model.state_dict()
+    for name in list_vars.keys():
+        print(name, list_vars[name].shape, list_vars[name].dtype)
+
+    fout = open(fname_out, "wb")
+
+    print(hparams)
+
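+    # NE/GGML-style file header: magic number (GGMF) followed by the format version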
+    fout.write(struct.pack("i", 0x67676d66))
+    fout.write(struct.pack("i", 1))
+
+    fout.write(struct.pack("i", hparams["vocab_size"]))
+    fout.write(struct.pack("i", hparams["hidden_size"]))
+    fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", hparams["num_attention_heads"]))
+    fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", hparams["num_hidden_layers"]))
+    fout.write(struct.pack("i", 128))
+    fout.write(struct.pack("i", ftype))
+    fout.write(struct.pack("i", hparams["model_max_length"]))
+    fout.write(struct.pack("f", 0))
+    fout.write(struct.pack("f", 0))
+    fout.write(struct.pack("i", 0))
+
+    fout.write(struct.pack("i", 0))  # word_embed_proj_dim (for opt)
+    fout.write(struct.pack("i", 0))  # do_layer_norm_before (for opt)
+
+    fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", 0))
+    fout.write(struct.pack("i", hparams["intermediate_size"]))
+    fout.write(struct.pack("i", 0))  # n_experts
+    fout.write(struct.pack("i", 0))  # n_expert_used
+    fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6)))  # rms_norm_eps or layer_norm_eps
+    fout.write(struct.pack("f", 10000.0))  # freq_base
+    fout.write(struct.pack("f", 1.0))  # rope_factor
+
+    fout.write(struct.pack("f", 0.0))  # config.json "rope_scaling.factor", not enabled
+    fout.write(struct.pack("i", 0))  # rope_scaling.original_max_position_embeddings
+    fout.write(struct.pack("i", 0))  # params["rope_scaling"]["type"] =="yarn" else 0))
 
     fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
     fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
@@ -230,8 +333,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
     parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
-                        default="huggingface", help="hub to load model")
+    parser.add_argument("--model_hub",
+                        choices=["huggingface", "modelscope"],
+                        default="huggingface",
+                        help="hub to load model")
     parser.add_argument("model", type=Path, help="directory containing model file")
     args = parser.parse_args(args_in)
 
@@ -255,7 +360,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     hparams = config.to_dict()
 
-    baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)
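+    # Baichuan-7B configs use hidden_size 4096; anything else falls through to the 13B path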
+    if hparams['hidden_size'] == 4096:
+        baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)
+    else:
+        baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)
 
 
 if __name__ == '__main__':
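As a quick sanity check on the format written above, here is a minimal sketch of how one tensor record produced by the conversion loop could be read back. The helper name `read_tensor` and the assumption that the file offset already sits past the model header and vocab section are illustrative only, not part of this change.

```python
import struct
import numpy as np

def read_tensor(f):
    # per-tensor header written as struct.pack("iii", n_dims, len(name), ftype_cur)
    header = f.read(12)
    if len(header) < 12:
        return None  # end of file
    n_dims, name_len, ftype_cur = struct.unpack("iii", header)
    # dims are stored innermost-first (reverse of the numpy shape)
    dims = struct.unpack("i" * n_dims, f.read(4 * n_dims))
    name = f.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype_cur == 1 else np.float32
    data = np.fromfile(f, dtype=dtype, count=int(np.prod(dims))).reshape(dims[::-1])
    return name, data
```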