# 斷詞

## 英文斷詞

In [4]:
# English tokenization: Keras lowercases the text, drops punctuation and
# splits on spaces, returning a plain list of words.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

english_sentence = "I love jogging, and you?"
english_tokens = text_to_word_sequence(english_sentence)
print("英文斷詞：", english_tokens)

英文斷詞： ['i', 'love', 'jogging', 'and', 'you']


## 中文斷詞

In [5]:
# Install jieba（結巴）
!pip install jieba

# Get the Tokenization Dictionary for Traditional Chinese
import os
Dictionary_File = 'dict.txt.big'

if not os.path.isfile(Dictionary_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + Dictionary_File)

# Get the Stop Words File for Traditional Chinese
StopWords_File = "stopWords_big5.txt"

if not os.path.isfile(StopWords_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + StopWords_File)



In [6]:
import jieba

# Optional: point jieba at the Traditional Chinese dictionary file.
# Left disabled here, so jieba runs with its default dictionary.
# jieba.set_dictionary(Dictionary_File)

# Tokenization: wrap jieba.cut in list() so the tokens can be printed and
# filtered several times below.
result = list(jieba.cut("我喜歡跑步，你呢？"))
print("中文斷詞（有標點）：", result)

# Remove stop words using an inline character set (punctuation, digits, spaces).
# set(<str>) produces one entry per character. The backslash is spelled "\\"
# because "\(" is an invalid escape sequence that triggers a SyntaxWarning on
# recent Python versions; the characters in the set are unchanged.
stopWords = set("$!&#%\\()+-*/_,. 　?:;'\"<=>^`|~[]{}’0123456789?_“”、。《》！，：；？「」（）")
print("中文斷詞（無標點）：", [word for word in result if word not in stopWords])

# Remove stop words using the stop-word file (one entry per line).
# strip() removes the trailing newline as well as any surrounding whitespace.
# NOTE(review): the file name mentions big5 but the file is decoded as UTF-8
# here; confirm the file's actual encoding if decoding ever fails.
with open(StopWords_File, "rt", encoding="utf-8") as f:
    stopWords = {line.strip() for line in f}
print("中文斷詞（更精簡）：", [word for word in result if word not in stopWords])

中文斷詞（有標點）： ['我', '喜歡', '跑步', '，', '你', '呢', '？']
中文斷詞（無標點）： ['我', '喜歡', '跑步', '你', '呢']
中文斷詞（更精簡）： ['喜歡', '跑步']


# 文字數位化

In [7]:
# Create a Tokenizer object that maps words to integer indices
from tensorflow.keras.preprocessing.text import Tokenizer

# The backslash in `filters` is written as "\\" because "\]" is an invalid
# escape sequence (SyntaxWarning on recent Python versions); the resulting
# filter string is exactly the same.
tk = Tokenizer(
        num_words=None,     # no cap on the vocabulary size
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',    # characters stripped before splitting
        lower=True,         # lowercase the text first
        split=' ',          # word separator
        char_level=False,   # tokenize by word, not by character
        oov_token='NiD'     # placeholder emitted for words outside the vocabulary
    )

In [8]:
# Learn the word <-> index mapping from a small corpus
corpus = [
    "I love jogging, and you?",
    "I love reading!",
]
tk.fit_on_texts(corpus)

# Show the mapping table in both directions:
# first WORD -> NUMBER, then NUMBER -> WORD
for mapping in (tk.word_index, tk.index_word):
    print(mapping)

{'NiD': 1, 'i': 2, 'love': 3, 'jogging': 4, 'and': 5, 'you': 6, 'reading': 7}
{1: 'NiD', 2: 'i', 3: 'love', 4: 'jogging', 5: 'and', 6: 'you', 7: 'reading'}


In [9]:
# Sentences to encode with the fitted tokenizer
input_text = [
    "I love jogging!",
    "and I love reading, too!",
]

# Text -> integer sequences
seq = tk.texts_to_sequences(input_text)
print(seq)

# Integer sequences -> text (round trip back through the mapping)
text = tk.sequences_to_texts(seq)
print(text)

[[2, 3, 4], [5, 2, 3, 7, 1]]
['i love jogging', 'and i love reading NiD']


# 序列對齊（Sequence Alignment）

In [10]:
# Bring every sequence to the same length with pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

padding_options = dict(
    maxlen=5,            # target length of every sequence
    dtype="int32",
    padding="pre",       # pad at the front of short sequences
    truncating="post",   # cut from the end of long sequences
    value=0,             # fill value used for padding
)
padded_seq = pad_sequences(seq, **padding_options)

print(padded_seq)

[[0 0 2 3 4]
 [5 2 3 7 1]]


# 編碼（Encoding）

In [11]:
# One-Hot Encoding
from tensorflow.keras.utils import to_categorical

# Pin the one-hot width to the vocabulary size (+1 for the padding index 0).
# Without num_classes, to_categorical infers the width from the largest index
# present in the input, so the encoding width would change from batch to batch.
num_classes = len(tk.word_index) + 1

print("獨熱編碼 -------------")
print(to_categorical(padded_seq, num_classes=num_classes))

獨熱編碼 -------------
[[[1. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [12]:
# Multi-Hot Encoding
print("多熱編碼 -------------")
print(tk.texts_to_matrix(input_text))

多熱編碼 -------------
[[0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 1. 0. 1.]]


In [13]:
# Word Embedding
import tensorflow as tf
from tensorflow.keras import layers

# Size the lookup table from the fitted tokenizer instead of hard-coding it:
# one row per vocabulary index plus row 0, which is used for padding.
vocab_size = len(tk.word_index) + 1
embedding_dim = 3   # length of each word vector

emb = layers.Embedding(vocab_size, embedding_dim)

# tf.constant(): Convert immediate values into tensor
# The layer is untrained and its weights are randomly initialized, so the
# printed numbers differ between runs.
result = emb(tf.constant(padded_seq))
print("詞向量嵌入 -------------")
print(result.numpy())

詞向量嵌入 -------------
[[[-2.8549397e-02 -1.6903508e-02  1.2396477e-02]
  [-2.8549397e-02 -1.6903508e-02  1.2396477e-02]
  [-3.8799167e-02  3.9433029e-02 -3.1143416e-02]
  [ 3.7658226e-02 -2.5706291e-03  1.8841151e-02]
  [-2.7186586e-02  2.7237687e-02 -2.7955068e-02]]

 [[ 1.7981578e-02 -7.7474862e-05  4.0822480e-02]
  [-3.8799167e-02  3.9433029e-02 -3.1143416e-02]
  [ 3.7658226e-02 -2.5706291e-03  1.8841151e-02]
  [ 2.2235621e-02  6.1217919e-03 -1.4494441e-02]
  [-3.1131400e-02  3.0269388e-02 -3.4443438e-02]]]
