In [1]:
# ! export http_proxy=http://10.12.44.139:7890
# ! export https_proxy=http://10.12.44.139:7890


In [2]:
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# 示例数据
texts = [
    "I love programming in Python",
    "Python is a great language",
    "Programming is fun"
]
labels = [1, 1, 0]  # 假设 1 表示正面评价，0 表示负面评价

# 加载预训练的分词器
tokenizer = AutoTokenizer.from_pretrained(
    "../.pretrained_models/bert-base-cased",
    clean_up_tokenization_spaces=True
)


# 定义数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=10):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 使用分词器进行分词和编码
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float)


# 创建数据集
dataset = TextDataset(texts, labels, tokenizer)

# 创建数据加载器
batch_size = 2
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 打印数据集中的一个批次
for batch in data_loader:
    input_ids, attention_mask, targets = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Targets:", targets)
    break

# 打印分词器的词汇表大小
print("Vocabulary size:", len(tokenizer))


Input IDs: tensor([[  101, 21076,  1110,  4106,   102,     0,     0,     0,     0,     0],
        [  101, 23334,  1110,   170,  1632,  1846,   102,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
Targets: tensor([0., 1.])
Vocabulary size: 28996


In [3]:
tokenizer

BertTokenizerFast(name_or_path='../.pretrained_models/bert-base-cased', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [4]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_mask_token',
 '

In [7]:
tokenizer.vocab.__len__()

28996

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    "../.pretrained_models/bert-base-chinese",
    clean_up_tokenization_spaces=True
)

In [11]:
tokenizer.vocab_size

21128

In [12]:
tokenizer = AutoTokenizer.from_pretrained(
    "../.pretrained_models/bert-base-german-cased",
    clean_up_tokenization_spaces=True
)

In [13]:
tokenizer.vocab

{'Beschwerde': 2243,
 '##läuf': 2270,
 '##unsten': 5403,
 'rechtswidrig': 9453,
 '##weisung': 4599,
 'Vorstand': 6287,
 '190': 1910,
 'Boy': 20187,
 'Antragsteller': 3384,
 'genügt': 8612,
 '[unused2774]': 29773,
 '##ilian': 8718,
 'bestellte': 25313,
 '[unused421]': 27420,
 'sach': 4965,
 'spezialisierte': 24583,
 'sorgen': 9103,
 '##abel': 3549,
 'ruhig': 17867,
 'nacheinander': 22363,
 'Wuchs': 26745,
 '##Prof': 13162,
 'verspä': 21065,
 '##cuador': 25992,
 'emerit': 22617,
 'Betätigung': 26609,
 '[unused2625]': 29624,
 '[unused643]': 27642,
 '##ikel': 3868,
 'Bewerbung': 18543,
 'Wolff': 19292,
 'Doch': 1679,
 'inform': 5867,
 'gefährdet': 11110,
 'hohen': 3661,
 'Cav': 19988,
 '##lagern': 10191,
 'werde': 1631,
 'Fürstentum': 23390,
 '[unused2702]': 29701,
 'Roma': 18487,
 'Wechsel': 4772,
 'Folgezeit': 12952,
 '##strei': 24495,
 '##oldung': 22103,
 'grün': 2559,
 '##Bahn': 5289,
 'erscheinen': 6339,
 'pf': 6699,
 '##folger': 3199,
 'Wohngebiet': 25157,
 '##100': 26547,
 'Kant': 3

In [14]:
tokenizer.vocab_size

30000

In [15]:
tokenizer.vocab_files_names

{'vocab_file': 'vocab.txt', 'tokenizer_file': 'tokenizer.json'}

In [16]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_mask_token',
 '

In [17]:
tokenizer.verbose

False

In [22]:
import torch
from transformers import AutoTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader

# 示例数据
texts = [
    "I love programming in Python",
    "Python is a great language",
    "Programming is fun"
]
labels = [1, 1, 0]  # 假设 1 表示正面评价，0 表示负面评价

# 加载预训练的分词器和模型
tokenizer = AutoTokenizer.from_pretrained("../.pretrained_models/bert-base-cased")
model = BertModel.from_pretrained("../.pretrained_models/bert-base-cased")


# 定义数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=10):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 使用分词器进行分词和编码
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float)


# 创建数据集
dataset = TextDataset(texts, labels, tokenizer)

# 创建数据加载器
batch_size = 2
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 打印数据集中的一个批次
for batch in data_loader:
    input_ids, attention_mask, targets = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Targets:", targets)
    break


# 获取句子的嵌入表示
def get_sentence_embeddings(texts, tokenizer, model):
    # 分词和编码
    encoding = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # 获取模型输出
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # 取平均作为句子嵌入

    return embeddings


# 获取示例文本的嵌入表示
sentence_embeddings = get_sentence_embeddings(texts, tokenizer, model)
print("Sentence Embeddings:")
print(sentence_embeddings)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Input IDs: tensor([[  101, 21076,  1110,  4106,   102,     0,     0,     0,     0,     0],
        [  101,   146,  1567,  4159,  1107, 23334,   102,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
Targets: tensor([0., 1.])
{'input_ids': tensor([[  101,   146,  1567,  4159,  1107, 23334,   102],
        [  101, 23334,  1110,   170,  1632,  1846,   102],
        [  101, 21076,  1110,  4106,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0]])}
Sentence Embeddings:
tensor([[ 0.4559,  0.1651, -0.2481,  ...,  0.2512,  0.1600,  0.2613],
        [ 0.6230,  0.0513, -0.2029,  ...,  0.1886,  0.0581,  0.2735],
        [ 0.4072,  0.1323,  0.2236,  ...,  0.0941,  0.3853,  0.3177]])


In [20]:
sentence_embeddings

tensor([[ 0.4559,  0.1651, -0.2481,  ...,  0.2512,  0.1600,  0.2613],
        [ 0.6230,  0.0513, -0.2029,  ...,  0.1886,  0.0581,  0.2735],
        [ 0.4072,  0.1323,  0.2236,  ...,  0.0941,  0.3853,  0.3177]])

In [21]:
sentence_embeddings.shape

torch.Size([3, 768])

In [27]:
import torch
from transformers import AutoTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader

# 示例数据
texts = [
    "I love programming in Python",
    "Python is a great language",
    "Programming is fun"
]
labels = [1, 1, 0]  # 假设 1 表示正面评价，0 表示负面评价

# 加载预训练的分词器和模型
tokenizer = AutoTokenizer.from_pretrained("../.pretrained_models/bert-base-cased")
model = BertModel.from_pretrained("../.pretrained_models/bert-base-cased")


# 定义数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=10):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 使用分词器进行分词和编码
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float)


# 创建数据集
dataset = TextDataset(texts, labels, tokenizer)

# 创建数据加载器
batch_size = 2
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 打印数据集中的一个批次
for batch in data_loader:
    input_ids, attention_mask, targets = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Targets:", targets)
    break


# 获取句子的词嵌入表示
def get_word_embeddings(texts, tokenizer, model):
    # 分词和编码
    encoding = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # 获取模型输出
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

    # 返回词嵌入表示
    return last_hidden_state


# 获取示例文本的词嵌入表示
word_embeddings = get_word_embeddings(texts, tokenizer, model)
print("Word Embeddings:")
print(word_embeddings)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Input IDs: tensor([[  101, 23334,  1110,   170,  1632,  1846,   102,     0,     0,     0],
        [  101,   146,  1567,  4159,  1107, 23334,   102,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
Targets: tensor([1., 1.])
Word Embeddings:
tensor([[[ 6.7468e-01,  6.9297e-02, -3.0896e-01,  ...,  9.4618e-02,
           5.6142e-01,  1.9888e-01],
         [ 3.1860e-01,  8.9978e-02,  2.0622e-01,  ...,  1.7626e-01,
          -6.3275e-02,  5.5940e-01],
         [ 1.3464e-01, -3.9309e-03, -4.9525e-01,  ...,  7.4246e-01,
          -6.7457e-01,  2.4827e-01],
         ...,
         [ 1.3741e-01, -1.6979e-01,  1.4357e-02,  ...,  2.7694e-01,
           2.7418e-01, -2.8970e-02],
         [ 4.4751e-01,  1.5007e-01, -5.0931e-01,  ..., -2.3448e-02,
           3.8850e-01,  2.9415e-01],
         [ 1.1601e+00,  4.0479e-01, -7.0263e-01,  ...,  1.0551e-01,
           9.0991e-01,  1.1078e-01]],

        [[ 6.6111e-01, -4.5665e-02, -2.372

In [25]:
word_embeddings.shape

torch.Size([3, 7, 768])

In [28]:
import transformers
dir(transformers)

['ASTConfig',
 'ASTFeatureExtractor',
 'ASTForAudioClassification',
 'ASTModel',
 'ASTPreTrainedModel',
 'Adafactor',
 'AdamW',
 'AdamWeightDecay',
 'AdaptiveEmbedding',
 'AddedToken',
 'Agent',
 'AlbertConfig',
 'AlbertForMaskedLM',
 'AlbertForMultipleChoice',
 'AlbertForPreTraining',
 'AlbertForQuestionAnswering',
 'AlbertForSequenceClassification',
 'AlbertForTokenClassification',
 'AlbertModel',
 'AlbertPreTrainedModel',
 'AlbertTokenizer',
 'AlbertTokenizerFast',
 'AlignConfig',
 'AlignModel',
 'AlignPreTrainedModel',
 'AlignProcessor',
 'AlignTextConfig',
 'AlignTextModel',
 'AlignVisionConfig',
 'AlignVisionModel',
 'AltCLIPConfig',
 'AltCLIPModel',
 'AltCLIPPreTrainedModel',
 'AltCLIPProcessor',
 'AltCLIPTextConfig',
 'AltCLIPTextModel',
 'AltCLIPVisionConfig',
 'AltCLIPVisionModel',
 'AlternatingCodebooksLogitsProcessor',
 'AqlmConfig',
 'AudioClassificationPipeline',
 'AutoBackbone',
 'AutoConfig',
 'AutoFeatureExtractor',
 'AutoImageProcessor',
 'AutoModel',
 'AutoModelForAu

In [35]:
import torch
from PIL import Image
from transformers import CLIPTokenizer, CLIPModel, CLIPProcessor

# 示例文本和图像
texts = ["a photo of a cat", "a photo of a dog"]
image_path = "SCR-20240914-scuf.png"

# 加载预训练的CLIP模型和分词器
tokenizer = CLIPTokenizer.from_pretrained("../.pretrained_models/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("../.pretrained_models/clip-vit-base-patch32")

# 加载图像
processor = CLIPProcessor.from_pretrained("../.pretrained_models/clip-vit-base-patch32")
image = Image.open(image_path)

# 处理文本和图像
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

# 获取模型输出
with torch.no_grad():
    outputs = model(**inputs)

# 提取文本和图像的嵌入表示
text_embeddings = outputs.text_embeds
image_embeddings = outputs.image_embeds

# 计算相似度
cosine_similarities = torch.nn.CosineSimilarity(dim=-1)(text_embeddings, image_embeddings)

# 打印相似度
print("Cosine Similarities:")
print(cosine_similarities)

# 打印文本和图像的嵌入表示
print("Text Embeddings:")
print(text_embeddings)
print("Image Embeddings:")
print(image_embeddings)


Cosine Similarities:
tensor([0.2331, 0.1988])
Text Embeddings:
tensor([[ 0.0148,  0.0070, -0.0234,  ..., -0.0508, -0.0438,  0.0033],
        [ 0.0087,  0.0258, -0.0387,  ..., -0.0547, -0.0242,  0.0112]])
Image Embeddings:
tensor([[-3.1646e-02, -4.4683e-02, -5.3537e-02, -4.1083e-02,  3.5067e-02,
         -2.9017e-02, -6.0700e-03,  1.1640e-02,  1.1509e-02, -9.0606e-03,
          5.8737e-02, -3.6815e-02,  1.6985e-02, -1.1781e-02, -2.3312e-03,
          1.3155e-02, -9.9854e-03, -6.1771e-03,  1.5439e-02,  2.9156e-02,
          2.3821e-02,  3.6498e-02,  4.3254e-02, -9.0280e-03, -9.9071e-03,
          6.5401e-03, -6.7367e-03, -3.3519e-02, -1.4676e-02, -1.4133e-02,
          5.9268e-03,  5.9447e-03,  3.1212e-03,  2.6177e-02,  3.6211e-02,
         -1.3909e-02,  3.3059e-02, -1.4424e-02,  7.4597e-03, -1.5723e-01,
          1.8544e-02, -8.7091e-03, -1.2254e-02, -5.2413e-02, -9.6742e-03,
          7.1011e-02,  2.5645e-02,  1.9217e-02,  1.8643e-02, -4.0501e-02,
          2.5664e-02,  5.4755e-02,  8.

In [32]:
text_embeddings.shape

torch.Size([2, 512])

In [33]:
image_embeddings.shape

torch.Size([1, 512])