In [1]:
import collections
import os

# Point the Hugging Face datasets cache at ./data. This has to be set BEFORE
# `datasets` is imported: the library reads HF_DATASETS_CACHE once, at import
# time, so assigning it afterwards has no effect on where files are cached.
os.environ["HF_DATASETS_CACHE"] = r"./data"

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import tqdm

In [2]:
# Request the IMDB "train" and "test" splits in a single call, caching the
# downloaded files under ./data, then unpack them into separate names.
imdb_splits = datasets.load_dataset(
    "imdb",
    split=["train", "test"],
    cache_dir="./data",
)
train_data, test_data = imdb_splits

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
# Show the training split together with its feature schema; as the last
# expression in the cell, the tuple is rendered as the cell's output.
train_data, train_data.features

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 {'text': Value(dtype='string', id=None),
  'label': ClassLabel(names=['neg', 'pos'], id=None)})

In [4]:
# Work on a small subset of each split to keep the rest of the notebook fast.
#
# The rows are shuffled (with a fixed seed, so the subset is reproducible)
# before selecting: taking the first N rows of a split that is stored grouped
# by label would yield a subset dominated by a single class.
# NOTE(review): the hosted IMDB splits appear to be ordered by label --
# confirm by inspecting the first SUBSET_SIZE labels of the unshuffled split.
SUBSET_SIZE = 5000
SUBSET_SEED = 1234

train_data = train_data.shuffle(seed=SUBSET_SEED).select(
    range(min(SUBSET_SIZE, len(train_data)))  # never request more rows than exist
)
test_data = test_data.shuffle(seed=SUBSET_SEED).select(
    range(min(SUBSET_SIZE, len(test_data)))
)

In [7]:
# Load spaCy's "en_core_web_sm" pipeline. The resulting object is passed to
# tokenize_text as the `tokenizer` callable when the datasets are mapped.
# NOTE(review): only `token.text` is read from its output in this notebook
# section; if nothing else needs the other pipeline components, `nlp.tokenizer`
# alone may be enough and faster -- confirm before changing.
nlp = spacy.load("en_core_web_sm")

def tokenize_text(data, tokenizer, max_length):
    """Tokenize one example's text and truncate the token list.

    Args:
        data: Mapping with a "text" entry holding the raw string.
        tokenizer: Callable that takes the text and returns an iterable of
            objects exposing a ``.text`` attribute.
        max_length: Slice bound applied to the token list (``None`` keeps
            every token).

    Returns:
        A dict ``{"tokens": [...]}`` with the (possibly truncated) token strings.
    """
    doc = tokenizer(data["text"])
    words = [tok.text for tok in doc]
    # Truncation is a plain slice, so it happens after the whole text is tokenized.
    return {"tokens": words[:max_length]}

In [8]:
# Maximum number of tokens kept per review.
max_length = 256

# Both splits are tokenized with the same function and the same arguments,
# so the keyword arguments are built once and shared.
tokenize_kwargs = {"tokenizer": nlp, "max_length": max_length}

# Each map call adds a "tokens" column produced by tokenize_text.
train_data = train_data.map(tokenize_text, fn_kwargs=tokenize_kwargs)
test_data = test_data.map(tokenize_text, fn_kwargs=tokenize_kwargs)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
# Hold out 20% of the training subset as a validation set.
# A fixed seed makes the split reproducible: without it, every run of this
# cell draws a different validation set, so results cannot be compared
# across runs.
# NOTE: this cell rebinds `train_data`; re-running it without re-running the
# cells above splits the already-reduced training set again.
train_valid_data = train_data.train_test_split(test_size=0.2, seed=1234)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [10]:
from spacy.vocab import Vocab

# Build a spaCy Vocab pre-populated with two example strings.
# NOTE(review): `vocab` is rebound to a fresh, empty Vocab() in the next cell,
# so this object is not used by the code that follows in this section.
vocab = Vocab(strings=["hello", "world"])

In [15]:
import spacy
from spacy.vocab import Vocab
from spacy.strings import StringStore

# Start from an empty spaCy Vocab and keep a handle on its string store,
# which provides both the word -> id and the id -> word lookups used below.
vocab = Vocab()
string_store = vocab.strings

# Example tokens to register.
tokens = ["this", "is", "a", "sample", "sentence", "another", "example"]

# Register every token in the vocab's string store.
for token in tokens:
    _ = string_store.add(token)

# Look up the integer id of each token once, in token order.
# Note: as the recorded output shows, these ids are large hash-like integers,
# not contiguous 0..N-1 positions.
token_ids = [string_store[token] for token in tokens]

# Print the word -> id mapping.
print("Word to Index:")
for token, index in zip(tokens, token_ids):
    print(f"Word '{token}' is mapped to index {index}")

# Print the reverse id -> word mapping.
print("\nIndex to Word:")
for index in token_ids:
    print(f"Index {index} is mapped to word '{string_store[index]}'")

Word to Index:
Word 'this' is mapped to index 1995909169258310477
Word 'is' is mapped to index 3411606890003347522
Word 'a' is mapped to index 11901859001352538922
Word 'sample' is mapped to index 10528961229637103608
Word 'sentence' is mapped to index 18108853898452662235
Word 'another' is mapped to index 7270490914741406701
Word 'example' is mapped to index 899618643364689362

Index to Word:
Index 1995909169258310477 is mapped to word 'this'
Index 3411606890003347522 is mapped to word 'is'
Index 11901859001352538922 is mapped to word 'a'
Index 10528961229637103608 is mapped to word 'sample'
Index 18108853898452662235 is mapped to word 'sentence'
Index 7270490914741406701 is mapped to word 'another'
Index 899618643364689362 is mapped to word 'example'
