In [1]:
import os
import pickle
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch
import shutil
import numpy as np
import warnings

# UserWarning 무시
warnings.simplefilter("ignore", UserWarning)

In [2]:
# Directory that holds the pickled transcription splits.
# NOTE(review): this is a plain string, not an f-string, so "{workspace}" is a
# literal placeholder; replace it with the real workspace root before running.
root_path = "{workspace}/my_project/src/data_origin/transcription"

# Split files, listed in the order in which they are loaded and merged.
filenames = [
    "transcription_training.pkl",
    "transcription_validation.pkl",
    "transcription_test.pkl",
]

In [3]:
# Merge all transcription splits into a single mapping.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# root_path must only ever point at files from a trusted source.
data = {}
for filename in filenames:
    with open(os.path.join(root_path, filename), 'rb') as file:
        data_tmp = pickle.load(file)
        print(filename, len(data_tmp))
    # dict.update silently replaces entries whose key already exists, so compare
    # the sizes before and after the merge to surface collisions between splits.
    # The merge result itself is unchanged: the later split still wins.
    size_before = len(data)
    data.update(data_tmp)
    overwritten = size_before + len(data_tmp) - len(data)
    if overwritten:
        # Printed rather than raised via warnings.warn so the notice stays
        # visible even when UserWarning is filtered out.
        print(f"WARNING: {filename} overwrote {overwritten} existing key(s)")
print(len(data))

transcription_training.pkl 6000
transcription_validation.pkl 2000
transcription_test.pkl 2000
10000


### Tokenizer and model preparation

In [4]:
# Load the pretrained tokenizer and encoder from the same BERT checkpoint.
# Notes carried over from the original comments:
#   - each token is represented as a 768-dimensional vector
#   - the maximum sequence length is 512
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

### Generate BERT embeddings

In [5]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to
# the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the chosen device and put it in evaluation mode.
model.to(device)
model.eval()

# Start every run from an empty output directory: delete whatever already
# exists at folder_path, then create the folder again.
# Caution: this removes the path's existing contents without asking.
folder_path = "./bert_embeddings/"
if os.path.exists(folder_path):
    shutil.rmtree(path=folder_path)
os.makedirs(name=folder_path, exist_ok=True)

In [6]:
# Encode every transcription with BERT and save its pooled output as
# "<key>.npy" inside folder_path. Gradient tracking is disabled because this
# is inference only.
with torch.no_grad():
    for key, value in tqdm(data.items()):
        # truncation=True caps the input at the tokenizer's maximum model length.
        # Without it, a transcription longer than that limit produces more
        # positions than the model supports and the forward pass fails.
        inputs = tokenizer(value, return_tensors="pt", truncation=True)
        # Distinct comprehension names so the loop's key/value are not shadowed.
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = model(**inputs)
        pooler_output = outputs.pooler_output
        # Bring the result back to host memory before converting to NumPy.
        # NOTE(review): assumes each value is a single string, in which case the
        # saved array keeps a leading batch dimension of 1 -- TODO confirm.
        pooler_output_numpy = pooler_output.cpu().numpy()

        save_filename = os.path.join(folder_path, key + ".npy")
        np.save(save_filename, pooler_output_numpy)

100%|██████████| 10000/10000 [02:06<00:00, 78.98it/s]
