## 安裝 jieba 套件

In [2]:
!pip install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 19.8 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314478 sha256=f83ff97392ba92e2bcbe4343120279c0516377a419876f1545ee0dfc4cc390fb
  Stored in directory: /home/jovyan/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1


## 建立新目錄，存放辭典檔案

In [3]:
!mkdir jieba_data

## 下載繁體字的辭典檔

In [4]:
# Download jieba's extra_dict/dict.txt.big (the dictionary used here for
# Traditional Chinese text) and save it as jieba_data/dict.txt.big.
!wget https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big -O jieba_data/dict.txt.big

--2020-07-13 08:59:04--  https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/dict.txt.big [following]
--2020-07-13 08:59:05--  https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/dict.txt.big
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.108.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8583143 (8.2M) [text/plain]
Saving to: ‘jieba_data/dict.txt.big’


2020-07-13 08:59:07 (118 MB/s) - ‘jieba_data/dict.txt.big’ saved [8583143/8583143]



## 載入 Jieba 套件

In [5]:
import jieba

### 檢查 Jieba 版本

In [6]:
# Display the version string of the imported jieba package.
jieba.__version__

'0.42.1'

## 指定辭典檔

In [7]:
# Tell jieba to use the downloaded dictionary file as its main dictionary.
jieba.set_dictionary('jieba_data/dict.txt.big')

In [43]:
# Peek at the first 10 lines of the dictionary file.
# Each line looks like "<word> <number> <tag>" — presumably frequency and part-of-speech tag; confirm against jieba docs.
!head -10 jieba_data/dict.txt.big

1号店 3 n
1號店 3 n
4S店 3 n
4s店 3 n
AA制 3 n
AB型 3 n
AT&T 3 nz
A型 3 n
A座 3 n
A股 3 n


## 分詞

In [21]:
# Sample sentence used by the segmentation examples in this notebook.
text_str = '今天天氣真好'

In [28]:
# Show the documentation of jieba.cut. help() is plain Python (no IPython-only
# `?` syntax) and prints to the cell's standard output.
help(jieba.cut)

### 精確模式分詞 (cut_all=False)

In [23]:
# Precise mode (cut_all=False): segment the sentence and print the tokens
# separated by " / ".
seg_result = jieba.cut(text_str, cut_all=False)
print(*seg_result, sep=' / ')

Building prefix dict from /home/jovyan/work/Text_wordcloud/jieba_data/dict.txt.big ...
Dumping model to file cache /tmp/jieba.u240ff8c1e70462be159af457c3f6d652.cache
Loading model cost 1.998 seconds.
Prefix dict has been built successfully.


今天天氣 / 真 / 好


### 全模式分詞 (cut_all=True)

In [24]:
# Full mode (cut_all=True): print every token jieba reports for the sentence,
# separated by " / ".
seg_result = jieba.cut(text_str, cut_all=True)
print(*seg_result, sep=' / ')

今天 / 今天天氣 / 天天 / 天氣 / 真好


### 搜尋引擎模式分詞

In [25]:
# Show the documentation of jieba.cut_for_search. help() is plain Python (no
# IPython-only `?` syntax) and prints to the cell's standard output.
help(jieba.cut_for_search)

In [26]:
# Search-engine mode with the HMM option switched on; tokens are printed
# separated by " / ".
seg_result = jieba.cut_for_search(text_str, HMM=True)
print(*seg_result, sep=' / ')

今天 / 天天 / 天氣 / 今天天氣 / 真 / 好


In [27]:
# Search-engine mode with the HMM option switched off, for comparison with the
# HMM=True run; tokens are printed separated by " / ".
seg_result = jieba.cut_for_search(text_str, HMM=False)
print(*seg_result, sep=' / ')

今天 / 天天 / 天氣 / 今天天氣 / 真 / 好


### Paddle 模式分詞 (use_paddle=True)

In [33]:
# Paddle mode: enable the paddle backend, then segment with use_paddle=True
# and print the tokens separated by " / ".
jieba.enable_paddle()
seg_result = jieba.cut(text_str, use_paddle=True)
print(*seg_result, sep=' / ')

Paddle enabled successfully......


今天 / 天氣 / 真好


## 詞性標註

#### 詞性說明: https://gist.github.com/luw2007/6016931

In [31]:
import jieba.posseg as pseg

# Part-of-speech tagging in paddle mode: each result unpacks into a
# (word, tag) pair, printed as "word, tag" on its own line.
jieba.enable_paddle()
seg_result = pseg.lcut(text_str, use_paddle=True)
for w, p in seg_result:
    print(f"{w}, {p}")

Paddle enabled successfully......


今天, TIME
天氣, n
真好, a


## 自定義詞庫

In [34]:
# Sample sentence used to demonstrate the effect of a custom user dictionary.
text_str_2 = '是在哈囉嗎?'

In [74]:
!cat jieba_data/mydict.txt

是在哈囉

In [35]:
# Segment the sentence before the user dictionary is loaded and print the
# tokens separated by " / ".
seg_result = jieba.lcut(text_str_2)
print(*seg_result, sep=' / ')

是 / 在 / 哈囉 / 嗎 / ?


In [37]:
# Load the custom user dictionary so that its entries are recognised as words
# by subsequent segmentation calls.
jieba.load_userdict('jieba_data/mydict.txt')

In [38]:
# Segment the same sentence again now that the user dictionary is loaded and
# print the tokens separated by " / ".
seg_result = jieba.lcut(text_str_2)
print(*seg_result, sep=' / ')

是在哈囉 / 嗎 / ?


In [39]:
# Print the contents of the user dictionary file.
!cat jieba_data/mydict.txt

是在哈囉

## 停止字

In [51]:
# Download the stop-word list from the jieba repository (extra_dict/stop_words.txt)
# and save it as jieba_data/stop_words.txt.
!wget https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/stop_words.txt -O jieba_data/stop_words.txt

--2020-07-13 15:58:00--  https://raw.githubusercontent.com/fxsjy/jieba/master/extra_dict/stop_words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 222 [text/plain]
Saving to: ‘jieba_data/stop_words.txt’


2020-07-13 15:58:00 (13.3 MB/s) - ‘jieba_data/stop_words.txt’ saved [222/222]



In [52]:
# Show the last 5 lines of the stop-word file.
!tail -5 jieba_data/stop_words.txt

你們
妳們
他們
她們
是否

In [55]:
# Open the file and read it one line at a time, stripping surrounding
# whitespace (including the newline) from every line.
with open(file='jieba_data/stop_words.txt', mode='r', encoding="UTF-8") as stop_words_file:
    stop_words_list = [raw_line.strip() for raw_line in stop_words_file]
stop_words_list

['the',
 'of',
 'is',
 'and',
 'to',
 'in',
 'that',
 'we',
 'for',
 'an',
 'are',
 'by',
 'be',
 'as',
 'on',
 'with',
 'can',
 'if',
 'from',
 'which',
 'you',
 'it',
 'this',
 'then',
 'at',
 'have',
 'all',
 'not',
 'one',
 'has',
 'or',
 'that',
 '的',
 '了',
 '和',
 '是',
 '就',
 '都',
 '而',
 '及',
 '與',
 '著',
 '或',
 '一個',
 '沒有',
 '我們',
 '你們',
 '妳們',
 '他們',
 '她們',
 '是否']

In [72]:
# Open the file, read it in one go, then split it into one entry per line.
with open(file='jieba_data/stop_words.txt', mode='r', encoding="UTF-8") as file:
    # splitlines() does not produce a trailing '' entry when the file ends
    # with a newline (split('\n') does). Entries are also stripped and blank
    # lines dropped, matching the line-by-line reader's use of strip().
    stop_words = [entry.strip() for entry in file.read().splitlines() if entry.strip()]
    print(type(stop_words))
    print(stop_words)

<class 'list'>
['the', 'of', 'is', 'and', 'to', 'in', 'that', 'we', 'for', 'an', 'are', 'by', 'be', 'as', 'on', 'with', 'can', 'if', 'from', 'which', 'you', 'it', 'this', 'then', 'at', 'have', 'all', 'not', 'one', 'has', 'or', 'that', '的', '了', '和', '是', '就', '都', '而', '及', '與', '著', '或', '一個', '沒有', '我們', '你們', '妳們', '他們', '她們', '是否']


In [71]:
# Segment a sample sentence, then keep only the tokens that are not listed in
# stop_words (the list read from jieba_data/stop_words.txt).
text_str_3 = '我是一位小學生，從小學習鋼琴，希望成為youtuber'
seg_result = jieba.lcut(text_str_3)
seg_result_stopword = [term for term in seg_result if term not in stop_words]
seg_result_stopword

['我', '一位', '小學生', '，', '從小', '學習', '鋼琴', '，', '希望', '成為', 'youtuber']

## 其他

In [47]:
# Print jieba's DEFAULT_DICT_NAME constant, i.e. the file name of the
# dictionary jieba uses when none is set explicitly.
print(jieba.DEFAULT_DICT_NAME)
# For where this constant is defined, check jieba/jieba/__init__.py in the jieba source tree.

dict.txt
