# 自製智能中文選字系統  (1)

## 資料前處理

確認執行環境的 Python 版本為 3.x

In [9]:
import sys
from tqdm import tqdm
sys.version

'3.6.8 (v3.6.8:3c6b436a57, Dec 24 2018, 02:04:31) \n[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]'

In [2]:
import re

In [3]:
def prepocess_line(line):
    """Return the maximal runs of Chinese characters found in ``line``.

    Anything outside the range U+4E00..U+9FA5 (punctuation, Latin letters,
    digits, whitespace, ...) acts as a separator and is dropped, so
    characters that were not adjacent in the input never share a segment.
    """
    chinese_runs = re.finditer(r'[\u4e00-\u9fa5]+', line)
    return [run.group(0) for run in chinese_runs]

In [4]:
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

In [5]:
# Collect every run of Chinese characters in the corpus into one flat list.
segments = []
# The corpus is Chinese text; name the encoding explicitly instead of relying
# on the platform default, which is not UTF-8 on every system.
# NOTE(review): assumes the file is UTF-8 encoded — confirm for other corpora.
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterating the file object streams it line by line rather than loading
    # every line into memory first.
    for line in fr:
        segments.extend(prepocess_line(line))

In [10]:
len(segments)

48768

In [11]:
segments[:10]

['英語',
 '英語英語',
 '又稱爲英文',
 '是一種西日耳曼語言',
 '誕生於中世紀早期的英格蘭',
 '如今具有全球通用語的地位',
 '英語',
 '一詞源於遷居英格蘭的日耳曼部落盎格魯',
 '而',
 '盎格魯']

---
## 計算詞頻

一開始要先計算字詞出現的次數

In [12]:
from collections import Counter

class Counters:
    """Occurrence counts of every 1..n character n-gram in a set of segments.

    After ``fit``, ``counters[k]`` for ``1 <= k <= n`` maps each k-character
    substring to the number of times it was seen. ``counters[0]`` holds the
    single key ``''`` whose value is the total number of single characters,
    which makes it usable as the denominator for unigram probabilities.
    """

    def __init__(self, n):
        """Create ``n + 1`` empty tables, one per n-gram size from 0 to n."""
        self.n = n
        # Index k holds the counts of k-character n-grams (index 0: total).
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Count all 1..n character n-grams of ``segments``.

        ``segments`` is iterated once per n-gram size, so it must be
        re-iterable (e.g. a list of strings). Counts are added to whatever
        the tables already contain.
        """
        for size in range(1, self.n + 1):
            table = self.counters[size]
            for seg in tqdm(segments):
                # update() costs time proportional to the segment length.
                # In-place Counter addition (+=) would additionally rescan
                # the whole accumulated table on every call to drop
                # non-positive entries, making the pass quadratic.
                table.update(self._skip(seg, size))

        # Total number of single characters, stored under the empty key.
        total = sum(self.counters[1].values())
        self.counters[0] = Counter({'': total})

    def __getitem__(self, k):
        """Return the count table for k-character n-grams."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every n-character window of ``segment`` from left to right."""
        assert n > 0
        # A segment shorter than n gives an empty range, so nothing is
        # yielded in that case.
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [13]:
# Build the 1-, 2- and 3-character frequency tables from the corpus segments.
counters = Counters(n=3)
counters.fit(segments)
# Table 0 holds only the total character count, stored under the key ''.
counters[0]

100%|██████████| 48768/48768 [00:05<00:00, 8217.08it/s] 
100%|██████████| 48768/48768 [01:53<00:00, 431.36it/s]
100%|██████████| 48768/48768 [03:17<00:00, 247.47it/s]


Counter({'': 371370})

In [14]:
counters[1]['英']

420

In [15]:
counters[0]

Counter({'': 371370})

---
## N-Gram

In [16]:
class Ngram:
    """Next-character model of order ``n`` backed by n-gram count tables.

    The probability of a character following a context is the count of the
    n-gram (context + character) divided by the count of the context, where
    the context is the last ``n - 1`` characters of the prefix.
    """

    def __init__(self, n, counters):
        """Pick the n-gram and (n-1)-gram tables out of ``counters``."""
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]        # counts of full n-grams
        self.minor_counter = counters[n - 1]    # counts of their contexts

    def predict_proba(self, prefix='', top_k=5):
        """Return ``(probability, character)`` pairs, most likely first.

        Only the last ``n - 1`` characters of ``prefix`` are used. At most
        ``top_k`` pairs are returned; a non-positive ``top_k`` returns every
        candidate. Ties are ordered by character, descending.
        """
        context_len = self.n - 1
        assert len(prefix) >= context_len

        # A unigram model has no context, so it looks up the empty key.
        context = prefix[-context_len:] if context_len > 0 else ''
        denominator = self.minor_counter[context]

        ranked = sorted(
            (
                (freq / denominator, gram[-1])
                for gram, freq in self.major_counter.items()
                if gram.startswith(context)
            ),
            reverse=True,
        )

        if top_k > 0:
            return ranked[:top_k]
        return ranked

    def get_proba_dict(self, prefix=''):
        """Return every candidate character mapped to its probability."""
        proba_by_char = {}
        for proba, char in self.predict_proba(prefix, top_k=-1):
            proba_by_char[char] = proba
        return proba_by_char

In [17]:
# Order-1 model: it uses no context, so any prefix gives the same ranking.
unigram = Ngram(1, counters)

In [18]:
unigram.predict_proba('我思')

[(0.035732557826426474, '的'),
 (0.012927807846621966, '國'),
 (0.010620136252255163, '中'),
 (0.00998465142580176, '在'),
 (0.009852707542343216, '一')]

In [19]:
# Order-2 model: conditions on the last character of the prefix.
bigram = Ngram(2, counters)
# Order-3 model: conditions on the last two characters of the prefix.
trigram = Ngram(3, counters)

## 使用Ngram來建立第一版選字系統

In [20]:
class ChineseWordRecommenderV1:
    """Next-character recommender that picks one n-gram model per request.

    The model is chosen purely from the prefix length: no characters typed
    uses the unigram model, one character uses the bigram model, and two or
    more characters use the trigram model.
    """

    def __init__(self, unigram, bigram, trigram):
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram

    def predict_proba(self, prefix='', top_k=5):
        """Return the chosen model's ``predict_proba(prefix, top_k)`` result."""
        typed = len(prefix)
        if typed == 0:
            chosen = self.unigram
        elif typed == 1:
            chosen = self.bigram
        else:
            chosen = self.trigram
        return chosen.predict_proba(prefix, top_k)

In [21]:
# Combine the three models; the prefix length decides which one answers.
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

In [22]:
probs = model.predict_proba('我思', top_k=10)
probs

[(0.75, '故'), (0.25, '維')]

In [23]:
probs = model.predict_proba('我', top_k=10)
probs

[(0.40298507462686567, '們'),
 (0.07462686567164178, '的'),
 (0.04477611940298507, '在'),
 (0.01990049751243781, '是'),
 (0.01990049751243781, '思'),
 (0.014925373134328358, '比'),
 (0.014925373134328358, '不'),
 (0.009950248756218905, '這'),
 (0.009950248756218905, '深'),
 (0.009950248756218905, '最')]

## Demo

In [25]:
import ipywidgets as widgets


def func(change):
    """Show the top-10 suggested next characters for the new text."""
    # change.new is the text area's value after the observed edit.
    probs = model.predict_proba(change.new, top_k=10)
    suggestions = '\t'.join(word for _, word in probs)
    label.value = ' ' + suggestions


# The text area is where the user types; the label shows the suggestions.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)

# Call func every time the text area's value changes.
text.observe(func, names='value')

Label(value='')

Textarea(value='')