In [12]:
import os
import json
from collections import defaultdict

import numpy as np
import spacy
from tqdm import tqdm

In [2]:
# Load the training captions: a JSON object whose values are the caption texts.
train_labels_path = 'data/deepfashion-multimodal/train_captions.json'
with open(train_labels_path, 'rb') as fp:
    # The file is opened in binary mode; the json module decodes the bytes itself.
    train_labels = json.loads(fp.read())

In [3]:
# Tokenize every training caption with spaCy and collect corpus statistics:
#   word_counts -- lower-cased token text -> number of occurrences
#   doc_lengths -- token count of each caption, in iteration order
nlp = spacy.load("en_core_web_sm")

word_counts = defaultdict(int)
# NOTE(review): word2vec is created here but nothing in this cell fills it.
word2vec = {}
doc_lengths = []

for text_label in tqdm(train_labels.values(), desc='Building vocabulary'):
    doc = nlp(text_label)
    doc_lengths.append(len(doc))
    for token in doc:
        # Case-fold so that e.g. "The" and "the" share one vocabulary entry.
        word_counts[token.text.lower()] += 1

Building vocabulary: 100%|██████████| 10155/10155 [00:58<00:00, 174.68it/s]


In [4]:
max(doc_lengths)  # the longest caption is 109 tokens

109

In [6]:
# Token -> id table. Corpus words receive ids 1..N in first-seen order,
# '<pad>' is pinned to id 0, and the remaining special tokens are appended
# after the words, each taking the next free id.
vocabulary = dict(zip(word_counts, range(1, len(word_counts) + 1)))
vocabulary['<pad>'] = 0
for special_token in ('<unk>', '<start>', '<end>'):
    vocabulary[special_token] = len(vocabulary)

# Reverse lookup: id -> token.
vocabulary_inv = {token_id: token for token, token_id in vocabulary.items()}

In [10]:
# Embedding width. Presumably the size of each per-word vector in the
# embedding matrix -- TODO confirm where this constant is consumed.
embed_dim = 96

In [15]:
# Build the embedding matrix and save it to 'word2vec.npy'.
# Row i holds the vector of the token whose vocabulary id is i. Tokens that
# have no entry in `word2vec` get a random vector drawn uniformly from [0, 1).
#
# The generator is seeded so that re-running the notebook writes the same
# matrix, and the vector width comes from `embed_dim` rather than a second
# hard-coded copy of the number.
rng = np.random.default_rng(0)

vecs = []
for i in range(len(vocabulary)):
    word = vocabulary_inv[i]
    if word in word2vec:
        vecs.append(word2vec[word])
    else:
        vecs.append(rng.random(embed_dim))

vecs_arr = np.array(vecs)
# Guard against vectors of the wrong width sneaking in via `word2vec`.
assert vecs_arr.shape == (len(vocabulary), embed_dim), vecs_arr.shape
np.save('word2vec.npy', vecs_arr)

In [14]:
# Sanity check: (number of rows, vector width) of the embedding matrix.
vecs_arr.shape

(110, 96)

In [37]:
# Save the vocabulary (token -> id) as JSON.
vocab_path = os.path.join('data/deepfashion-multimodal', 'vocab.json')
with open(vocab_path, 'w') as fw:
    fw.write(json.dumps(vocabulary))

In [16]:
# Display the full token -> id mapping.
vocabulary

{'the': 1,
 'upper': 2,
 'clothing': 3,
 'has': 4,
 'long': 5,
 'sleeves': 6,
 ',': 7,
 'cotton': 8,
 'fabric': 9,
 'and': 10,
 'solid': 11,
 'color': 12,
 'patterns': 13,
 '.': 14,
 'neckline': 15,
 'of': 16,
 'it': 17,
 'is': 18,
 'v': 19,
 '-': 20,
 'shape': 21,
 'lower': 22,
 'length': 23,
 'denim': 24,
 'this': 25,
 'lady': 26,
 'also': 27,
 'wears': 28,
 'an': 29,
 'outer': 30,
 'with': 31,
 'complicated': 32,
 'female': 33,
 'wearing': 34,
 'a': 35,
 'ring': 36,
 'on': 37,
 'her': 38,
 'finger': 39,
 'neckwear': 40,
 'tank': 41,
 'shirt': 42,
 'no': 43,
 'chiffon': 44,
 'graphic': 45,
 'round': 46,
 'person': 47,
 'pants': 48,
 'are': 49,
 'top': 50,
 'woman': 51,
 'trousers': 52,
 'there': 53,
 'belt': 54,
 'accessory': 55,
 'wrist': 56,
 'sweater': 57,
 'lattice': 58,
 'three': 59,
 'point': 60,
 'pure': 61,
 'in': 62,
 'his': 63,
 'neck': 64,
 'sleeve': 65,
 'plaid': 66,
 'its': 67,
 'lapel': 68,
 'socks': 69,
 'shoes': 70,
 'suspenders': 71,
 'short': 72,
 't': 73,
 'shorts'

In [18]:
import sentencepiece as spm

# Write every caption to a single text file, one caption per line, to serve
# as the SentencePiece training corpus. The encoding is set explicitly:
# SentencePiece consumes UTF-8 text, whereas open() would otherwise use the
# platform's locale encoding.
with open('all_texts.txt', 'w', encoding='utf-8') as f:
    f.writelines(text_label + '\n' for text_label in train_labels.values())

# Train the SentencePiece model; this writes m.model and m.vocab.
spm.SentencePieceTrainer.train('--input=all_texts.txt --model_prefix=m --vocab_size=229')

# Load the trained model.
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Try the model on a sample sentence.
tokens = sp.encode_as_pieces('Example text to tokenize')
print(tokens)


['▁', 'Ex', 'a', 'm', 'p', 'le', '▁t', 'e', 'x', 't', '▁', 'to', '▁', 'to', 'k', 'e', 'n', 'i', 'z', 'e']


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=all_texts.txt --model_prefix=m --vocab_size=229
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: all_texts.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 229
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
 

In [19]:
# Scan the split-caption file and track the longest entry, measured in spaCy
# tokens. Each entry unpacks into a pair whose second element is the text.
max_len = 0
with open('data/deepfashion-multimodal/train_captions_split.json', 'rb') as fr:
    train_labels_split = json.loads(fr.read())
    p_bar = tqdm(train_labels_split)
    for _, text in p_bar:
        doc = nlp(text)
        n_tokens = len(doc)
        if n_tokens <= max_len:
            continue
        # New maximum: record it and surface it on the progress bar.
        max_len = n_tokens
        p_bar.set_postfix(max_len=max_len)

100%|██████████| 58832/58832 [02:28<00:00, 397.46it/s, max_len=23]
