In [16]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab


In [17]:
texts = [
    "I love programming in Python",
    "Python is a great language",
    "Programming is fun"
]


In [18]:
tokenizer = get_tokenizer('basic_english')
tokens = [tokenizer(text) for text in texts]


In [19]:
counter = Counter()
for token_list in tokens:
    counter.update(token_list)


In [20]:
counter

Counter({'programming': 2,
         'python': 2,
         'is': 2,
         'i': 1,
         'love': 1,
         'in': 1,
         'a': 1,
         'great': 1,
         'language': 1,
         'fun': 1})

In [21]:
vocab = vocab(counter, min_freq=1, specials=['<unk>', '<pad>'])


In [23]:
# 查找词汇索引
word_index = vocab['python']
print(f"'Python' 的索引为: {word_index}")

# 将词汇列表转换为索引列表
text = "Python is a great language"
tokenized_text = tokenizer(text)
indexed_text = [vocab[token] for token in tokenized_text]
print(f"分词后的索引列表: {indexed_text}")


'Python' 的索引为: 6
分词后的索引列表: [6, 7, 8, 9, 10]


In [8]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader

# 定义分词器
tokenizer = get_tokenizer('basic_english')

# 示例数据
texts = [
    "I love programming in Python",
    "Python is a great language",
    "Programming is fun"
]
labels = [1, 1, 0]  # 假设 1 表示正面评价，0 表示负面评价

# 分词并统计词频
tokens = [tokenizer(text) for text in texts]
counter = Counter()
for token_list in tokens:
    counter.update(token_list)

# 定义词汇表
special_tokens = ['<unk>', '<pad>']
vocab = Vocab(counter, min_freq=1, specials=special_tokens, special_first=True)

# 定义数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, vocab, max_length=10):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # 分词并转换为索引
        tokenized_text = self.tokenizer(text)
        indexed_text = [self.vocab[token] if token in self.vocab else self.vocab['<unk>'] for token in tokenized_text]
        
        # 填充或截断
        if len(indexed_text) < self.max_length:
            indexed_text += [self.vocab['<pad>']] * (self.max_length - len(indexed_text))
        else:
            indexed_text = indexed_text[:self.max_length]
        
        return torch.tensor(indexed_text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# 创建数据集
dataset = TextDataset(texts, labels, tokenizer, vocab)

# 创建数据加载器
batch_size = 2
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 打印数据集中的一个批次
for batch in data_loader:
    inputs, targets = batch
    print("Inputs:", inputs)
    print("Targets:", targets)
    break

# 打印词汇表
print("Vocabulary size:", len(vocab))
print("Vocabulary:", vocab)


TypeError: Vocab.__init__() got an unexpected keyword argument 'min_freq'