<a href="https://colab.research.google.com/github/kimhwijin/TensorflowWithKeras/blob/master/RNN/RNN_GRU_POS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Part-of-speech (POS) tagging
# Uses the 10% sample of the POS-tagged Penn Treebank dataset.
# Input: a sequence of words; output: the sequence of POS tags matching those words.

In [3]:
import nltk
# Fetch the "treebank" corpus package into the local NLTK data directory.
nltk.download("treebank")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [4]:
import os
import numpy as np
import tensorflow as tf
import shutil

In [11]:
def _read_stripped_lines(filename, limit=None):
  """Return the stripped lines of `filename`, at most `limit` of them (all if None)."""
  lines = []
  with open(filename, "r") as f:
    for line in f:
      # Check the limit BEFORE appending so exactly `limit` lines are returned.
      if limit is not None and len(lines) >= limit:
        break
      lines.append(line.strip())
  return lines


def download_and_read(dataset_dir, num_pairs=None):
  """Load treebank sentences and their POS-tag sequences as parallel lists.

  On first use the NLTK treebank tagged sentences are written to two text
  files inside `dataset_dir` (one sentence per line, tokens separated by a
  single space); later calls read those cached files directly.

  Args:
    dataset_dir: directory that holds (or will hold) "treebank-sents.txt"
      and "treebank-poss.txt". Created if missing.
    num_pairs: maximum number of (sentence, tags) pairs to return;
      None returns every pair.

  Returns:
    (sents, poss): two lists of strings. sents[i] is the space-joined words
    of sentence i and poss[i] the space-joined tags of the same sentence.
  """
  sent_filename = os.path.join(dataset_dir, "treebank-sents.txt")
  poss_filename = os.path.join(dataset_dir, "treebank-poss.txt")
  print(poss_filename)
  if not (os.path.exists(sent_filename) and os.path.exists(poss_filename)):
    # Imported lazily: NLTK is only needed when the cache has to be built.
    import nltk
    os.makedirs(dataset_dir, exist_ok=True)
    # List of tagged sentences; each sentence is a sequence of (word, tag) pairs.
    sentences = nltk.corpus.treebank.tagged_sents()
    # `with` guarantees both files are closed even if a write fails midway.
    with open(sent_filename, "w") as fsents, open(poss_filename, "w") as fposs:
      for sent in sentences:
        fsents.write(" ".join([w for w, p in sent]) + "\n")
        fposs.write(" ".join([p for w, p in sent]) + "\n")

  sents = _read_stripped_lines(sent_filename, num_pairs)
  poss = _read_stripped_lines(poss_filename, num_pairs)
  return sents, poss

# Directory holding the cached sentence / tag text files.
DATASET_DIR = "drive/MyDrive/Datasets/treebank_pos"
sents, poss = download_and_read(dataset_dir=DATASET_DIR)

# Each sentence must come with exactly one tag sequence.
num_records = len(sents)
assert num_records == len(poss)
print("# of records : {:d}".format(num_records))

drive/MyDrive/Datasets/treebank_pos/treebank-poss.txt
# of records : 3914


In [18]:
def tokenize_and_build_vocab(texts, vocab_size=None, lower=True):
  """Fit a Keras Tokenizer on `texts` and return its vocabulary mappings.

  Args:
    texts: the texts passed to `Tokenizer.fit_on_texts`.
    vocab_size: when given, the tokenizer is built with
      num_words=vocab_size + 1 and the out-of-vocabulary token "UNK", and
      its word index is trimmed to entries whose index is <= vocab_size + 1.
      When None, the tokenizer is unrestricted and has no OOV token.
    lower: forwarded to the Tokenizer's `lower` option.

  Returns:
    (word2idx, idx2word, tokenizer): the (possibly trimmed) word -> index
    dict, its inverse, and the fitted tokenizer.
  """
  tokenizer_kwargs = {"lower": lower}
  if vocab_size is not None:
    tokenizer_kwargs.update(num_words=vocab_size + 1, oov_token="UNK")
  tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_kwargs)
  tokenizer.fit_on_texts(texts)

  if vocab_size is not None:
    # NOTE(review): this keeps index vocab_size + 1 in the word index, while
    # num_words=vocab_size + 1 presumably stops the tokenizer from emitting
    # it -- confirm the extra entry is intended.
    highest_index = vocab_size + 1
    tokenizer.word_index = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index <= highest_index
    }

  word2idx = tokenizer.word_index
  idx2word = dict((index, word) for word, index in word2idx.items())
  return word2idx, idx2word, tokenizer

# Source side: the sentences, with the tokenizer's default lower-casing.
word2idx_s, idx2word_s, tokenizer_s = tokenize_and_build_vocab(
    sents, vocab_size=9927)
# Target side: the POS tag sequences, with their case preserved.
word2idx_t, idx2word_t, tokenizer_t = tokenize_and_build_vocab(
    poss, vocab_size=38, lower=False)
source_vocab_size, target_vocab_size = len(word2idx_s), len(word2idx_t)

print("vocab size (source): {:d}, (target) : {:d}".format(source_vocab_size, target_vocab_size))

vocab size (source): 9928, (target) : 39


In [21]:
# Sentence lengths measured in whitespace-separated tokens.
seq_lengths = np.array([len(sentence.split()) for sentence in sents])
# Percentiles of the length distribution, used to choose a padding length.
percentile_points = [75, 80, 90, 95, 99, 99.9, 100]
length_percentiles = [
    (point, np.percentile(seq_lengths, point)) for point in percentile_points
]
print(length_percentiles)


[(75, 33.0), (80, 35.0), (90, 41.0), (95, 47.0), (99, 58.0), (99.9, 93.56600000000799), (100, 271.0)]


In [25]:
# Fixed sequence length for padding; 271 is the 100th percentile of
# seq_lengths printed by the previous cell.
max_seqlen = 271

# Encode words and tags as integer ids and pad each sequence at the end
# ("post") up to max_seqlen.
sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sents_as_ints, maxlen=max_seqlen, padding="post")
poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(poss_as_ints, maxlen=max_seqlen, padding="post")

# One-hot encode the tag ids in a single call. poss_as_ints is already a
# (num_sentences, max_seqlen) array, so the result is
# (num_sentences, max_seqlen, target_vocab_size + 1) with no further padding.
poss_as_catints = tf.keras.utils.to_categorical(
    poss_as_ints, num_classes=target_vocab_size + 1, dtype="int32")
print(poss_as_catints.shape)

dataset = tf.data.Dataset.from_tensor_slices((sents_as_ints, poss_as_catints))

# Index 0 is labelled "PAD" in both inverse vocabularies.
idx2word_s[0], idx2word_t[0] = "PAD", "PAD"

# Shuffle ONCE with a fixed order. With the default
# reshuffle_each_iteration=True the order is redrawn on every pass, so the
# take()/skip() splits below would contain different examples each epoch and
# test/validation examples would leak into training.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False)

# One third for test, then one tenth of the remainder for validation, and
# everything left for training.
test_size = len(sents) // 3
val_size = (len(sents) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

batch_size = 128
test_dataset = test_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
train_dataset = train_dataset.batch(batch_size)

(3914, 271, 40)
