In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
from typing import Counter
from tqdm import tqdm

In [10]:
# Source: http://www.manythings.org/anki/
# The Chinese-English translation pairs are used here.
# The file is read as tab-separated text with no header row, so pandas labels
# the columns 0, 1, ...
raw_pairs = pd.read_csv("./tatoeba_dataset/cmn.txt", sep="\t", header=None)
# Keep only the first two columns and give them readable names.
df = raw_pairs[[0, 1]].rename(columns={0: "eng", 1: "zht"})
df

Unnamed: 0,eng,zht
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Stay.,待著。
4,Stay.,且慢。
...,...,...
30914,"If you don't want to put on sunscreen, that's ...",你不想涂防晒霜是你的问题，但是晒伤了不要来抱怨。
30915,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。
30916,It's very easy to sound natural in your own na...,你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。
30917,"I got fired from the company, but since I have...",虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。


In [11]:
# Special tokens shared by every vocabulary in this notebook.
sos_token = "<sos>"  # start-of-sequence marker
eos_token = "<eos>"  # end-of-sequence marker
unk_token = "<unk>"  # stand-in for tokens missing from the vocabulary
pad_token = "<pad>"  # filler used to bring sequences to a common length


class Vocab:
    """Two-way mapping between tokens and integer indices.

    Special tokens come first, in the order given. The remaining tokens follow
    in descending frequency order; tokens with equal frequency keep the order
    in which ``counter`` yields them (the sort is stable).

    Args:
        counter: Mapping from token to frequency (anything with ``.items()``).
        min_freq: Tokens whose frequency is below this value are left out.
        specials: Iterable of special tokens to place at the lowest indices.
            Repeated entries are ignored after their first occurrence.

    Attributes:
        itos: List mapping index -> token.
        stoi: Dict mapping token -> index.
        unk_index: Index of ``unk_token``, or ``None`` when it is not one of
            the vocabulary entries.
    """

    def __init__(self, counter, min_freq=1, specials=None):
        # dict.fromkeys drops repeated specials while keeping first-seen order,
        # so itos[i] and stoi[token] always agree with each other.
        self.itos = list(dict.fromkeys(specials or []))
        self.stoi = {token: index for index, token in enumerate(self.itos)}

        # Most frequent tokens first.
        by_frequency = sorted(counter.items(), key=lambda item: item[1], reverse=True)

        for word, freq in by_frequency:
            if freq < min_freq:
                # Sorted in descending order: everything after this is rarer too.
                break
            if word not in self.stoi:
                self.stoi[word] = len(self.itos)
                self.itos.append(word)

        # Fallback index for unknown tokens; None if <unk> is not in the vocabulary.
        self.unk_index = self.stoi.get(unk_token)

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.itos)

    def __getitem__(self, token):
        """Return the index of ``token``, or ``unk_index`` when it is unknown."""
        return self.stoi.get(token, self.unk_index)

In [None]:
from typing import Tuple
from pandas import DataFrame, Series
from torch import Tensor
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader


class CMNDataset(Dataset):
    """Dataset of padded English/Chinese token-id sequences.

    The input frame must have string columns ``eng`` and ``zht``. Construction
    adds the columns ``eng_tok`` / ``zht_tok`` (token lists) and ``eng_ids`` /
    ``zht_ids`` (padded id lists) to an internal copy stored in ``self.df``,
    and builds one vocabulary per language (``eng_vocab``, ``zht_vocab``).

    English is tokenized on whitespace, Chinese character by character.
    """

    def __init__(self, df_: DataFrame) -> None:
        super().__init__()
        # Work on a copy so the caller's frame is not modified, and normalise
        # only the two text columns: any other column (for example token lists
        # added to the same frame elsewhere) is not a string and must not be
        # passed through preprocess_nmt.
        frame = df_.copy()
        for column in ("eng", "zht"):
            frame[column] = frame[column].map(CMNDataset.preprocess_nmt)

        frame["eng_tok"] = frame["eng"].map(str.split)  # whitespace tokens
        frame["zht_tok"] = frame["zht"].map(list)  # one token per character
        frame["eng_ids"], self.eng_vocab = self.tokenize_id(frame["eng_tok"])
        frame["zht_ids"], self.zht_vocab = self.tokenize_id(frame["zht_tok"])
        self.df = frame

    def __len__(self) -> int:
        """Return the number of sentence pairs."""
        return len(self.df)

    def __getitem__(self, index) -> Tuple[Tensor, Tensor]:
        """Return the ``(eng_ids, zht_ids)`` tensors of the pair at position ``index``.

        Lookup is positional (``.iloc``), so it does not depend on the frame's
        index labels, and an out-of-range position raises ``IndexError`` --
        which is what lets a plain ``for x, y in dataset`` loop terminate.
        """
        eng = self.df["eng_ids"].iloc[index]
        zht = self.df["zht_ids"].iloc[index]
        # NOTE(review): tensors are int32; cast downstream if a consumer
        # requires int64 indices/targets -- confirm against the training code.
        return torch.tensor(eng, dtype=torch.int), torch.tensor(zht, dtype=torch.int)

    @staticmethod
    def preprocess_nmt(text: str):
        """Normalise one sentence before tokenization.

        Replaces non-breaking spaces with regular spaces, lower-cases the text
        and inserts a space in front of each of ``, . ! ?`` unless the
        character before it is already a space (the first character of the
        text is never prefixed).
        """
        punctuation = frozenset(",.!?")

        # Replace non-breaking spaces with plain spaces and lower-case everything.
        text = text.replace("\u202f", " ").replace("\xa0", " ").lower()

        # Insert a space between a word and the punctuation mark that follows it.
        pieces = []
        for position, char in enumerate(text):
            needs_space = (
                position > 0 and char in punctuation and text[position - 1] != " "
            )
            pieces.append(" " + char if needs_space else char)
        return "".join(pieces)

    @staticmethod
    def tokenize_id(df: Series):
        """Build a vocabulary from token lists and encode them as padded id lists.

        Args:
            df: Series whose elements are lists of tokens.

        Returns:
            A pair ``(ids, vocab)``. ``ids`` is a Series of id lists laid out as
            ``<sos> tokens... <eos> <pad>...``; every list has length
            ``longest token list + 2``. ``vocab`` is the ``Vocab`` used for
            the encoding, with the special tokens at the lowest indices.
        """
        counter = Counter()
        for example in tqdm(df):
            counter.update(example)
        vocab = Vocab(
            counter,
            min_freq=1,
            specials=[unk_token, pad_token, sos_token, eos_token],
        )

        # Length of the longest token list; 0 for an empty Series.
        max_len = max((len(tokens) for tokens in df), default=0)

        def encode(tokens):
            # Wrap with <sos>/<eos>, then pad up to the common length.
            padded = (
                [sos_token]
                + list(tokens)
                + [eos_token]
                + [pad_token] * (max_len - len(tokens))
            )
            return [vocab[token] for token in padded]

        return df.map(encode), vocab


# Build the dataset, then show the first processed row and the first tensor pair.
dataset = CMNDataset(df)
print(dataset.df.head(1))

first_pair = next(iter(dataset), None)
if first_pair is not None:
    x, y = first_pair
    print(x)
    print(y)

100%|██████████| 30919/30919 [00:00<00:00, 1870662.61it/s]
100%|██████████| 30919/30919 [00:00<00:00, 1585898.59it/s]


    eng zht  eng_tok zht_tok  \
0  hi .  嗨。  [hi, .]  [嗨, 。]   

                                             eng_ids  \
0  [2, 1485, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                             zht_ids  
0  [2, 2143, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
tensor([   2, 1485,    4,    3,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
       dtype=torch.int32)
tensor([   2, 2143,    4,    3,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
       dtype=torch.int32)


In [None]:
# Tokenize the English-Chinese sentence pairs held in the DataFrame.


def tokenize_eng(text):
    """Split on single spaces and wrap the tokens with sos_token / eos_token."""
    return [sos_token, *text.split(" "), eos_token]


def tokenize_zht(text):
    """Emit one token per character, wrapped with sos_token / eos_token."""
    return [sos_token, *(str(ch) for ch in text), eos_token]


df["eng_tok"] = df["eng"].map(tokenize_eng)
df["zht_tok"] = df["zht"].map(tokenize_zht)

df


Unnamed: 0,eng,zht,eng_tok,zht_tok
0,Hi.,嗨。,"[<sos>, Hi., <eos>]","[<sos>, 嗨, 。, <eos>]"
1,Hi.,你好。,"[<sos>, Hi., <eos>]","[<sos>, 你, 好, 。, <eos>]"
2,Run.,你用跑的。,"[<sos>, Run., <eos>]","[<sos>, 你, 用, 跑, 的, 。, <eos>]"
3,Stay.,待著。,"[<sos>, Stay., <eos>]","[<sos>, 待, 著, 。, <eos>]"
4,Stay.,且慢。,"[<sos>, Stay., <eos>]","[<sos>, 且, 慢, 。, <eos>]"
...,...,...,...,...
30914,"If you don't want to put on sunscreen, that's ...",你不想涂防晒霜是你的问题，但是晒伤了不要来抱怨。,"[<sos>, If, you, don't, want, to, put, on, sun...","[<sos>, 你, 不, 想, 涂, 防, 晒, 霜, 是, 你, 的, 问, 题, ，,..."
30915,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。,"[<sos>, Even, now,, I, occasionally, think, I'...","[<sos>, 即, 使, 是, 现, 在, ，, 我, 偶, 尔, 还, 是, 想, 见,..."
30916,It's very easy to sound natural in your own na...,你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。,"[<sos>, It's, very, easy, to, sound, natural, ...","[<sos>, 你, 很, 容, 易, 把, 母, 语, 说, 得, 通, 顺, 流, 畅,..."
30917,"I got fired from the company, but since I have...",虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。,"[<sos>, I, got, fired, from, the, company,, bu...","[<sos>, 虽, 然, 我, 被, 公, 司, 解, 雇, 了, ，, 但, 是, 我,..."


In [None]:
# Number of leading vocabulary entries to preview; the printed labels are
# derived from it so that label and slice cannot disagree.
TOP_K = 30


def _build_vocab(token_lists):
    """Count tokens over all rows and build a Vocab with the special tokens first.

    Returns the ``(counter, vocab)`` pair.
    """
    token_counter = Counter()
    for example in tqdm(token_lists):
        token_counter.update(example)
    built = Vocab(
        token_counter,
        min_freq=1,
        specials=[unk_token, pad_token, sos_token, eos_token],
    )
    return token_counter, built


counter, vocab = _build_vocab(df["eng_tok"])

print(f"English vocabulary size: {len(vocab)}")
print(f"Top {TOP_K} English tokens:", vocab.itos[:TOP_K])
print(vocab.stoi["the"])


zh_counter, zh_vocab = _build_vocab(df["zht_tok"])

print(f"Chinese vocabulary size: {len(zh_vocab)}")
print(f"Top {TOP_K} Chinese tokens:", zh_vocab.itos[:TOP_K])
print(zh_vocab.stoi["我"])

100%|██████████| 30919/30919 [00:00<00:00, 1677190.00it/s]


English vocabulary size: 12827
Top 10 English tokens: ['<unk>', '<pad>', '<sos>', '<eos>', 'I', 'to', 'the', 'a', 'you', 'is', 'Tom', 'in', 'of', 'He', 'was', 'have', 'The', 'for', 'that', 'my', 'She', 'your', "I'm", 'me', 'do', 'on', 'at', 'are', "don't", 'like']
6


100%|██████████| 30919/30919 [00:00<00:00, 1057977.32it/s]

Chinese vocabulary size: 3685
Top 10 Chinese tokens: ['<unk>', '<pad>', '<sos>', '<eos>', '。', '我', '的', '了', '你', '他', '不', '是', '？', '一', '在', '姆', '有', '汤', '她', '，', '要', '很', '这', '人', '這', '想', '到', '天', '們', '什']
5





In [None]:
# Convert the token lists in the DataFrame to integer ids.


def token_id_eng(arr):
    """Map English tokens to ids; unknown tokens fall back to the vocab's unk index."""
    return [vocab[token] for token in arr]


def token_id_zht(arr):
    """Map Chinese tokens to ids; unknown tokens fall back to the vocab's unk index."""
    # Go through Vocab.__getitem__ (not the raw stoi dict) so that an unseen
    # character yields the unk index instead of raising KeyError, exactly as
    # token_id_eng does.
    return [zh_vocab[token] for token in arr]


df["eng_ids"] = df["eng_tok"].map(token_id_eng)
df["zht_ids"] = df["zht_tok"].map(token_id_zht)

df

Unnamed: 0,eng,zht,eng_tok,zht_tok,eng_ids,zht_ids
0,Hi.,嗨。,"[<sos>, Hi., <eos>]","[<sos>, 嗨, 。, <eos>]","[2, 5063, 3]","[2, 2145, 4, 3]"
1,Hi.,你好。,"[<sos>, Hi., <eos>]","[<sos>, 你, 好, 。, <eos>]","[2, 5063, 3]","[2, 8, 37, 4, 3]"
2,Run.,你用跑的。,"[<sos>, Run., <eos>]","[<sos>, 你, 用, 跑, 的, 。, <eos>]","[2, 7134, 3]","[2, 8, 96, 440, 6, 4, 3]"
3,Stay.,待著。,"[<sos>, Stay., <eos>]","[<sos>, 待, 著, 。, <eos>]","[2, 5064, 3]","[2, 253, 185, 4, 3]"
4,Stay.,且慢。,"[<sos>, Stay., <eos>]","[<sos>, 且, 慢, 。, <eos>]","[2, 5064, 3]","[2, 1153, 599, 4, 3]"
...,...,...,...,...,...,...
30914,"If you don't want to put on sunscreen, that's ...",你不想涂防晒霜是你的问题，但是晒伤了不要来抱怨。,"[<sos>, If, you, don't, want, to, put, on, sun...","[<sos>, 你, 不, 想, 涂, 防, 晒, 霜, 是, 你, 的, 问, 题, ，,...","[2, 166, 8, 28, 41, 5, 195, 25, 12819, 852, 21...","[2, 8, 10, 25, 2367, 1307, 2044, 2459, 11, 8, ..."
30915,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。,"[<sos>, Even, now,, I, occasionally, think, I'...","[<sos>, 即, 使, 是, 现, 在, ，, 我, 偶, 尔, 还, 是, 想, 见,...","[2, 857, 1983, 4, 3726, 56, 137, 29, 5, 95, 53...","[2, 1054, 437, 11, 179, 14, 19, 5, 947, 1000, ..."
30916,It's very easy to sound natural in your own na...,你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。,"[<sos>, It's, very, easy, to, sound, natural, ...","[<sos>, 你, 很, 容, 易, 把, 母, 语, 说, 得, 通, 顺, 流, 畅,...","[2, 51, 44, 397, 5, 1024, 1494, 11, 21, 486, 1...","[2, 8, 21, 426, 441, 62, 245, 188, 67, 45, 318..."
30917,"I got fired from the company, but since I have...",虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。,"[<sos>, I, got, fired, from, the, company,, bu...","[<sos>, 虽, 然, 我, 被, 公, 司, 解, 雇, 了, ，, 但, 是, 我,...","[2, 4, 78, 4452, 65, 6, 12824, 111, 652, 4, 15...","[2, 1553, 272, 5, 118, 163, 538, 255, 1779, 7,..."


In [None]:
# Align sequence lengths
