<a href="https://colab.research.google.com/github/hufsaim/T10402201/blob/master/notebook/Lab07_preprocess_txt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Preprocessing

In [1]:
import random
import re
import collections
import torch

In [2]:
class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 always holds the empty string ``''`` and is the index returned
    for tokens that are not in the vocabulary. Reserved tokens come next,
    followed by corpus tokens in descending frequency order.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a token collection.

        Args:
            tokens: Tokens to count; handed to ``count_corpus`` as-is.
                Defaults to an empty list.
            min_freq: Corpus tokens whose count is below this value are
                left out of the vocabulary.
            reserved_tokens: Tokens placed directly after index 0,
                regardless of their frequency. Defaults to an empty list.
        """
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else reserved_tokens

        frequencies = count_corpus(tokens)
        # (token, count) pairs, most frequent first.
        self.token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True)

        self.unk = 0
        fixed_tokens = [''] + reserved_tokens
        # Corpus tokens that are frequent enough and not already fixed.
        frequent_tokens = [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in fixed_tokens]

        self.idx_to_token = fixed_tokens + frequent_tokens
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token to its index, or a list/tuple of tokens to a list.

        Tokens missing from the vocabulary map to ``self.unk``.
        """
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(token) for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index to its token, or a list/tuple of indices to a list."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[index] for index in indices]
        return self.idx_to_token[indices]

def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Either a flat sequence of tokens or a sequence of token
            lists. The input is treated as nested when it is empty or when
            its first element is a ``list``.

    Returns:
        A ``collections.Counter`` mapping each token to its count.
    """
    nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not nested:
        return collections.Counter(tokens)

    # Nested input: walk every line and tally its tokens one by one.
    frequencies = collections.Counter()
    for line in tokens:
        for token in line:
            frequencies[token] += 1
    return frequencies

def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and its vocabulary.

    Args:
        max_tokens: When positive, the corpus is truncated to this many
            entries; otherwise the full corpus is returned.

    Returns:
        A ``(corpus, vocab)`` pair where ``corpus`` is a flat list of
        vocabulary indices and ``vocab`` is the ``Vocab`` built from the
        character tokens.
    """
    # NOTE(review): read_time_machine is not defined in the visible part of
    # this notebook -- confirm it exists before calling this function.
    char_lines = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_lines)

    # Flatten all lines into one long sequence of indices.
    corpus = []
    for line in char_lines:
        corpus.extend(vocab[token] for token in line)

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

def seq_data_iter_random(corpus, batch_size, num_steps):
    """Yield minibatches of subsequences drawn in random order.

    Args:
        corpus: Sequence of token indices.
        batch_size: Number of subsequences per minibatch.
        num_steps: Length of each subsequence.

    Yields:
        ``(X, Y)`` tensor pairs built with ``torch.tensor``; each row of
        ``Y`` is the corresponding row of ``X`` shifted one position to the
        right in the corpus.
    """
    # Drop a random prefix (0 .. num_steps - 1 items) so the partition
    # boundaries differ from call to call.
    offset = random.randint(0, num_steps - 1)
    corpus = corpus[offset:]

    # One item is held back because every Y window reaches one step further.
    num_subseqs = (len(corpus) - 1) // num_steps
    starts = [k * num_steps for k in range(num_subseqs)]
    random.shuffle(starts)

    num_batches = num_subseqs // batch_size
    for batch_no in range(num_batches):
        batch_starts = starts[batch_no * batch_size:(batch_no + 1) * batch_size]
        X = [corpus[s:s + num_steps] for s in batch_starts]
        Y = [corpus[s + 1:s + 1 + num_steps] for s in batch_starts]
        yield torch.tensor(X), torch.tensor(Y)


In [3]:
def tokenize(lines, token='word'):
    """Split each text line into word or character tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: ``'word'`` splits each line on whitespace; ``'char'`` splits
            each line into its individual characters.

    Returns:
        A list containing one token list per input line.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # Fail loudly: printing a message and implicitly returning None would
    # only defer the failure to whatever consumes the result.
    raise ValueError(f'unknown token type: {token!r}')

임의의 txt 파일을 불러옵니다.


In [4]:
# Read the text file into a list of raw lines.
path0 = 'timemachine.txt' # replace your own path
with open(path0, 'r') as f:
  lines = f.readlines()
# NOTE(review): the next two statements are bare expressions whose values are
# never assigned, so the lower-cased, letters-only version of the text is
# discarded and `lines` still holds the raw file content afterwards.
# Presumably `lines = [...]` was intended -- confirm before changing, since
# everything computed from `lines` later depends on it.
lines
[re.sub('[^A-Za-z]+',' ',line).strip().lower() for line in lines]

# Report the number of lines and show two sample lines (index 0 and 10).
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])


# text lines: 3617
Project Gutenberg's The Time Machine, by H. G. (Herbert George) Wells

Author: H. G. (Herbert George) Wells



line별로 token으로 분리합니다.


In [5]:
# Split every line into tokens using tokenize's default mode ('word',
# i.e. whitespace splitting).
tokens = tokenize(lines)
# Print the token lists of the first 11 lines.
for i in range(11):
    print(tokens[i])

['Project', "Gutenberg's", 'The', 'Time', 'Machine,', 'by', 'H.', 'G.', '(Herbert', 'George)', 'Wells']
[]
['This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with']
['almost', 'no', 'restrictions', 'whatsoever.', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or']
['re-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included']
['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.net']
[]
[]
['Title:', 'The', 'Time', 'Machine']
[]
['Author:', 'H.', 'G.', '(Herbert', 'George)', 'Wells']


vocab을 생성합니다.


In [6]:
# Build the vocabulary from the tokenized lines (default min_freq and no
# reserved tokens).
vocab = Vocab(tokens)
# Show the first 10 (token, index) pairs of the token-to-index mapping.
print(list(vocab.token_to_idx.items())[:10])

[('', 0), ('the', 1), ('of', 2), ('and', 3), ('I', 4), ('a', 5), ('to', 6), ('in', 7), ('was', 8), ('my', 9)]


In [7]:
# For lines 0 and 10, print the word tokens next to their vocabulary indices.
for i in [0, 10]:
    print('words:', tokens[i])
    # Indexing vocab with a list returns a list of indices.
    print('indices:', vocab[tokens[i]])

words: ['Project', "Gutenberg's", 'The', 'Time', 'Machine,', 'by', 'H.', 'G.', '(Herbert', 'George)', 'Wells']
indices: [50, 1635, 17, 33, 298, 29, 867, 868, 1152, 1153, 869]
words: ['Author:', 'H.', 'G.', '(Herbert', 'George)', 'Wells']
indices: [2771, 867, 868, 1152, 1153, 869]


In [8]:
# A non-positive max_tokens leaves the corpus untruncated.
max_tokens = -1
# Flatten all token lines into a single list of vocabulary indices.
corpus = [vocab[token] for line in tokens for token in line]
if max_tokens > 0:
  corpus = corpus[:max_tokens]

# Corpus length (number of tokens) and vocabulary size.
len(corpus), len(vocab)

(35319, 7636)

딥러닝모델의 학습에 활용하기 위한 data loader가 제대로 작동하는지 확인합니다.


In [9]:
# Exercise the random iterator on just the first 14 corpus indices, with
# 2 subsequences per batch and 5 steps per subsequence, printing each pair.
for X, Y in seq_data_iter_random(corpus[:14], batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tensor([[  33,  298,   29,  867,  868],
        [1152, 1153,  869,  219,  703]]) 
Y: tensor([[ 298,   29,  867,  868, 1152],
        [1153,  869,  219,  703,   27]])


vocab을 통하여, 인덱스 0~9의 token을 확인합니다. 인덱스 0은 미등록(unknown) token을 위한 빈 문자열이며, 그 뒤로 빈도수 기준 상위 9개의 token이 이어집니다.


In [10]:
# Print the vocabulary entries at indices 0-9. Index 0 is the empty-string
# placeholder that Vocab puts first, so it prints as a blank line before the
# corpus tokens.
for v in range(0,10):
  print(vocab.idx_to_token[v])


the
of
and
I
a
to
in
was
my
