# ginza (sudachi)へのユーザ辞書の追加

In [1]:
import os
# When the current directory path contains "src", move two levels up and print
# the resulting working directory. Later cells build paths from os.getcwd().
# NOTE(review): this is a substring test on the whole path, so any path that
# merely contains "src" also triggers the chdir - confirm against the repo layout.
if "src" in os.getcwd():
    os.chdir("../..")
print(os.getcwd())

/workspace


In [2]:
import spacy, ja_ginza, sudachipy, sudachidict_core

In [4]:
# Display the versions of spaCy, ja_ginza and SudachiPy in use.
spacy.__version__, ja_ginza.__version__, sudachipy.__version__

('3.2.2', '5.1.0', '0.6.3')

In [87]:
# Words to register in the user dictionary, one [part-of-speech label, surface form]
# pair per entry. Each label must be one of the strings handled by hinshi_format().
# NOTE(review): the recorded outputs further down show a 東京オリンピック row and a
# build of 3 entries, which this 2-entry list would not produce - confirm the
# list is current.
user_dic_list = [
    ["名詞-固有名詞-人名-一般", "ルイズ・フランソワーズ"],
    ["名詞-固有名詞-一般", "ゼロの使い魔"],
]

In [89]:
import re
import unicodedata


def _unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def _remove_extra_spaces(s):
    """
    余計なスペースは除去
    """
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def _normalize_neologd(s):
    """Unify width and symbol variants in ``s``.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. NFKC-normalize fullwidth digits/letters and the halfwidth range ｡-ﾟ.
      3. Replace runs of hyphen variants with ``-``, runs of long-vowel-mark
         variants with ``ー`` and runs of tilde variants with ``〜``.
      4. Map ASCII and halfwidth punctuation to fullwidth counterparts.
      5. Remove extra spaces with ``_remove_extra_spaces``.
      6. NFKC-normalize the listed fullwidth symbols again (``＝``, ``・``,
         ``「`` and ``」`` are not in that list and are kept as they are).
      7. Replace ``’`` with ``'`` and ``”`` with ``"``.

    Letter case is not changed by this function.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.strip()
    s = _unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)

    # Character-by-character translation table, paired by position.
    narrow = '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'
    wide = '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
    to_wide = dict(zip(map(ord, narrow), map(ord, wide)))
    s = s.translate(to_wide)

    s = _remove_extra_spaces(s)
    s = _unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    for curly, straight in (('[’]', '\''), ('[”]', '"')):
        s = re.sub(curly, straight, s)
    return s

def normalize_text(text):
    """Remove line breaks, turn tabs into spaces and lowercase ``text``.

    ``"\\n"`` and ``"\\r"`` are deleted, each ``"\\t"`` becomes one space, and
    the result is lowercased. The middle dot ``・`` is kept.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    for old, new in (("\n", ""), ("\r", ""), ("\t", " ")):
        text = text.replace(old, new)
    return text.lower()
    

# 辞書に格納

[sudachi ユーザ辞書 公式のドキュメント](https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md)

1語ごとに、以下の `columns` の順に値を並べた list を作成し、ユーザ辞書用の list に追加していく。

columns = ["見出し", "左連接ID", "右連接ID", "コスト", "見出し", "品詞1", "品詞2", "品詞3", "品詞4", 
           "品詞 (活用型)", "品詞 (活用形)", "読み", "正規化表記", "辞書形ID", "分割タイプ", "A単位分割情報", "B単位分割情報", "未使用"]

In [90]:
def hinshi_format(hinshi):
    """Expand a hyphen-joined part-of-speech label into dictionary columns.

    Args:
        hinshi: One of the supported labels, e.g. ``"名詞-固有名詞-人名-一般"``.

    Returns:
        A 5-tuple ``(pos1, pos2, pos3, pos4, rensa_id)``: four part-of-speech
        column values (``"*"`` where a level is unused) followed by the integer
        ID registered for that label.

    Raises:
        ValueError: If ``hinshi`` is not a supported label. The label is also
            printed before the exception is raised.
    """
    supported = (
        ("名詞-固有名詞-人名-一般", ("名詞", "固有名詞", "人名", "一般", 4788)),
        ("名詞-一般", ("名詞", "普通名詞", "一般", "*", 5146)),
        ("名詞-サ変接続", ("名詞", "普通名詞", "サ変接続", "*", 5133)),
        ("名詞-固有名詞-一般", ("名詞", "固有名詞", "一般", "*", 4786)),
        ("名詞-固有名詞-地名-一般", ("名詞", "固有名詞", "地名", "一般", 4792)),
        ("名詞-固有名詞-人名-姓", ("名詞", "固有名詞", "人名", "姓", 4790)),
        ("名詞-固有名詞-人名-名", ("名詞", "固有名詞", "人名", "名", 4789)),
        # In the IPA scheme, na-adjectives (形容動詞) are filed under nouns as
        # adjectival-noun stems and are not part of 形容詞.
        # Original author's note for sudachi: "Juman => 形容詞".
        ("形容詞", ("形容詞", "一般", "*", "*", 5161)),
    )
    for label, row in supported:
        if hinshi == label:
            return row

    print(hinshi)
    raise ValueError(f"ルールにない品詞が含まれいます。{hinshi}")

In [91]:
# Turn every [POS label, surface form] pair into one 18-column row of the
# user-dictionary CSV, then print the first and last rows as a spot check.
new_user_dic = []

for pos_label, surface in user_dic_list:
    # A finer-grained normalization would really be needed here.
    surface = normalize_text(surface)

    # 5000-9000 is reportedly the recommended cost for nouns; a much lower value
    # is used on purpose, and it drops further as the headword gets longer.
    cost = int(10000 / len(surface)) - 5000
    pos1, pos2, pos3, pos4, conn_id = hinshi_format(pos_label)

    # The same ID fills both connection-ID columns, and the surface form is
    # reused for the display headword, reading and normalized form.
    row = [surface, conn_id, conn_id, cost, surface, pos1, pos2, pos3, pos4]
    row += ["*", "*", surface, surface]
    row += ["*"] * 5
    new_user_dic.append(row)

print(new_user_dic[0])
print(new_user_dic[-1])

['ルイズ・フランソワーズ', 4788, 4788, -4091, 'ルイズ・フランソワーズ', '名詞', '固有名詞', '人名', '一般', '*', '*', 'ルイズ・フランソワーズ', 'ルイズ・フランソワーズ', '*', '*', '*', '*', '*']
['東京オリンピック', 4786, 4786, -3750, '東京オリンピック', '名詞', '固有名詞', '一般', '*', '*', '*', '東京オリンピック', '東京オリンピック', '*', '*', '*', '*', '*']


# .sudachiにユーザ定義辞書を保存

In [92]:
import os
# Directory that will hold the user dictionary: <current working directory>/.sudachi
cwd = os.getcwd()
print(cwd)
dic_dir = f"{cwd}/.sudachi"
os.makedirs(dic_dir, exist_ok=True)  # no error when the directory already exists

/workspace


In [93]:
# Labels for the 18 fields of one user-dictionary row, in file order.
# "見出し" appears twice (positions 0 and 4), so the labels are not unique.
columns = ["見出し", "左連接ID", "右連接ID", "コスト", "見出し", "品詞1", "品詞2", "品詞3", "品詞4", 
           "品詞 (活用型)", "品詞 (活用形)", "読み", "正規化表記", "辞書形ID", "分割タイプ", "A単位分割情報", "B単位分割情報", "未使用"]

In [94]:
import pandas as pd
# Write the rows as a CSV without header or index; this file is the input of
# the `sudachipy ubuild` step below.
df = pd.DataFrame(new_user_dic, columns=columns)
df.to_csv(f"{dic_dir}/user_dic.csv", header=False, index=False)
# The preview goes last: only a cell's final expression is displayed, so a
# mid-cell `df.head()` would produce no output.
df.head()

# ユーザ定義辞書をビルド

In [95]:
import sys

# Use the last sys.path entry whose final path component contains
# "site-packages"; site_package stays "" when no entry qualifies.
candidates = [entry for entry in sys.path if "site-packages" in os.path.basename(entry)]
site_package = candidates[-1] if candidates else ""
print(site_package)

/root/.pyenv/versions/3.9.7/lib/python3.9/site-packages


In [96]:
# user dicの作成 (.sudachi/user.dicが作成される)
!rm {dic_dir}/user.dic
!sudachipy ubuild -s '{site_package}/sudachidict_core/resources/system.dic' {dic_dir}/user_dic.csv -o {dic_dir}/user.dic

/workspace/.sudachi/user_dic.csv -> 3 in 0.01 sec
validate -> 3 in 0.00 sec
pos_table -> 2 in 0.00 sec
conn_matrix -> 6 in 0.00 sec
trie -> 1028 in 0.00 sec
word_id table -> 19 in 0.00 sec
word_params -> 22 in 0.00 sec
wordinfo_offsets -> 12 in 0.00 sec
wordinfos (copy only) -> 92 in 0.00 sec


In [97]:
# Build the path of sudachi.json inside the installed sudachipy package,
# print that path, then print the file's contents.
sudachi_config_path = os.path.join(site_package, "sudachipy/resources/sudachi.json")
print(sudachi_config_path)
!cat {sudachi_config_path}

/root/.pyenv/versions/3.9.7/lib/python3.9/site-packages/sudachipy/resources/sudachi.json
{
    "systemDict" : null,
    "characterDefinitionFile" : "char.def",
    "inputTextPlugin" : [
        { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
        { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkPlugin",
          "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
          "replacementSymbol": "ー"},
	    { "class": "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin",
          "leftBrackets": ["(", "（"],
          "rightBrackets": [")", "）"],
          "maxYomiganaLength": 4}
    ],
    "oovProviderPlugin" : [
        { "class" : "com.worksap.nlp.sudachi.MeCabOovPlugin",
          "charDef" : "char.def",
          "unkDef" : "unk.def" },
        { "class" : "com.worksap.nlp.sudachi.SimpleOovPlugin",
          "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
          "leftId" : 5968,
          "rightId" : 5968,
          "cost" : 3857 }
    ],
    "pathRewritePlugin"

In [98]:
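# Manual step (not executed by this notebook):
#   1. Copy sudachipy's bundled `resources` directory into .sudachi/ (command below).
#   2. Edit the copied .sudachi/resources/sudachi.json and add the entry
#      "userDict" : ["/workspace/.sudachi/user.dic"]
# !cp -r /root/.pyenv/versions/3.9.7/lib/python3.9/site-packages/sudachipy/resources ./.sudachi/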

# ginzaでユーザ定義辞書を読み込む

In [105]:
from spacy.lang.ja import JapaneseTokenizer

# https://github.com/megagonlabs/ginza/blob/61cb655f2e5c85980f1a1bbc7d833623931e4235/ginza/analyzer.py
def try_sudachi_import(split_mode="A", config_path=None):
    """Create a SudachiPy tokenizer for the requested split mode.

    Args:
        split_mode: ``"A"``, ``"B"``, ``"C"`` or ``None``; ``None`` selects
            the same mode as ``"A"``.
        config_path: Forwarded as ``config_path`` to
            ``sudachipy.dictionary.Dictionary``.

    Returns:
        The object returned by
        ``Dictionary(config_path=config_path).create(mode=...)``.

    Raises:
        ImportError: If an ImportError occurs while importing SudachiPy or
            while building the tokenizer; it is replaced by one carrying
            installation instructions.
        KeyError: If ``split_mode`` is not one of the accepted values.
    """
    try:
        from sudachipy import dictionary, tokenizer

        modes = tokenizer.Tokenizer.SplitMode
        mode_by_name = {None: modes.A, "A": modes.A, "B": modes.B, "C": modes.C}
        sudachi_dict = dictionary.Dictionary(config_path=config_path)
        return sudachi_dict.create(mode=mode_by_name[split_mode])
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy and SudachiDict-core "
            "(https://github.com/WorksApplications/SudachiPy). "
            "Install with `pip install sudachipy sudachidict_core` or "
            "install spaCy with `pip install spacy[ja]`."
        ) from None

class MySudachiTokenizer(JapaneseTokenizer):
    """JapaneseTokenizer whose SudachiPy tokenizer is built from a given config file.

    The base-class initializer is not called; the attributes are assigned
    directly in ``__init__``.
    """

    def __init__(self, nlp, config_path=None) -> None:
        """Set up the tokenizer from an existing pipeline.

        Args:
            nlp: Pipeline object; its ``vocab`` and ``tokenizer.split_mode``
                are reused.
            config_path: Passed to ``try_sudachi_import`` to build the
                SudachiPy tokenizer.
        """
        self.nlp = nlp
        self.vocab = nlp.vocab
        self.split_mode = nlp.tokenizer.split_mode
        self.tokenizer = try_sudachi_import(
            split_mode=self.split_mode,
            config_path=config_path,
        )
        # True for every split mode other than None and "A".
        self.need_subtokens = self.split_mode is not None and self.split_mode != "A"
        

In [106]:
import spacy
import ginza

# Baseline pipeline that keeps the tokenizer ja_ginza ships with.
nlp = spacy.load('ja_ginza')
ginza.set_split_mode(nlp, "A")

# Second, independent pipeline for the user-dictionary experiment.
# `rep_nlp = nlp` would only bind another name to the same object, so replacing
# rep_nlp.tokenizer later would also change `nlp` and the baseline cell could
# no longer be re-run with the original tokenizer.
rep_nlp = spacy.load('ja_ginza')
ginza.set_split_mode(rep_nlp, "A")

In [107]:
# Baseline: normalize the sample sentence, analyze it with `nlp`, and print
# each token's index, surface form and part-of-speech tag, sentence by sentence.
# `text` is reused by the later cell that runs `rep_nlp`.
text = "ルイズ・フランソワーズは、ゼロの使い魔のヒロインです。"
text = normalize_text(text)
print(text)
doc = nlp(text)

for sent in doc.sents:
    for token in sent:
        print(token.i, token.orth_, token.tag_)

ルイズ・フランソワーズは、ゼロの使い魔のヒロインです。
0 ルイズ 名詞-固有名詞-人名-一般
1 ・ 補助記号-一般
2 フランソワーズ 名詞-固有名詞-人名-一般
3 は 助詞-係助詞
4 、 補助記号-読点
5 ゼロ 名詞-数詞
6 の 助詞-格助詞
7 使い魔 名詞-普通名詞-一般
8 の 助詞-格助詞
9 ヒロイン 名詞-普通名詞-一般
10 です 助動詞
11 。 補助記号-句点


In [110]:
# Use the hand-edited copy of sudachi.json under dic_dir rather than a
# hard-coded absolute path, so the cell follows wherever .sudachi was created.
# NOTE: the "userDict" entry inside that JSON file is still whatever path was
# written into it by hand.
config_path = os.path.join(dic_dir, "resources", "sudachi.json")
rep_nlp.tokenizer = MySudachiTokenizer(rep_nlp, config_path=config_path)

In [111]:
# Analyze the same `text` with `rep_nlp` and print each token's index,
# surface form and part-of-speech tag for comparison with the baseline output.
doc = rep_nlp(text)

for sent in doc.sents:
    for token in sent:
        print(token.i, token.orth_, token.tag_)

0 ルイズ・フランソワーズ 名詞-固有名詞-人名-一般
1 は 助詞-係助詞
2 、 補助記号-読点
3 ゼロの使い魔 名詞-固有名詞-一般
4 の 助詞-格助詞
5 ヒロイン 名詞-普通名詞-一般
6 です 助動詞
7 。 補助記号-句点
