In [1]:
import sudachipy
# Display the file path of the imported sudachipy package (i.e. where it is installed).
sudachipy.__file__

'/opt/conda/lib/python3.7/site-packages/sudachipy/__init__.py'

In [3]:
# List the files in the sudachidict_core package's resources directory.
!ls /opt/conda/lib/python3.7/site-packages/sudachidict_core/resources

LEGAL  LICENSE-2.0.txt	system.dic


In [7]:
# Build a Sudachi user dictionary from add_term.csv, passing the core system
# dictionary with -s. No -o is given, so the result is written to the tool's
# default output file (presumably user.dic -- confirm with `sudachipy ubuild -h`).
!sudachipy ubuild \
  -s /opt/conda/lib/python3.7/site-packages/sudachidict_core/resources/system.dic \
  add_term.csv

reading the source file...1 words
writing the POS table...2 bytes
writing the connection matrix...4 bytes
building the trie...done
writing the trie...1028 bytes
writing the word-ID table...9 bytes
writing the word parameters...10 bytes
writing the word_infos...48 bytes
writing word_info offsets...4 bytes


In [11]:
# List the files in the sudachipy package's resources directory
# (rewrite.def from this directory is read later in the notebook).
!ls /opt/conda/lib/python3.7/site-packages/sudachipy/resources

char.def  rewrite.def  sudachi.json  unk.def


In [5]:
# Print the working directory; relative paths used in this notebook resolve against it.
!pwd

/home/jovyan


In [8]:
import spacy

# Load the 'ja_ginza' model and analyse one sample sentence.
nlp = spacy.load('ja_ginza')
doc = nlp('アナと雪の女王を観に行きたい')

# Print each token's surface text, one per line, walking the sentences in order.
for token in (tok for sentence in doc.sents for tok in sentence):
    print(token.orth_)

アナと雪の女王
を
観
に
行き
たい


# 万病辞書変換

In [3]:
import sys
import unicodedata


class SudachiCharNormalizer:
    """Normalize text using the rules of a Sudachi-style ``rewrite.def`` file.

    The definition file is read line by line; blank lines and lines starting
    with ``#`` are skipped. Every other line is split on whitespace:

    * one column  -- a single character that is exempt from NFKC
      normalization (it is still lower-cased);
    * two columns -- a source string and the string that replaces it.

    Attributes:
        ignore_normalize_set: characters whose lower-cased form is emitted
            without NFKC normalization.
        replace_char_map: mapping of source string -> replacement string.
        max_replace_key_length: length of the longest key in
            ``replace_char_map``; it bounds the lookup window in ``rewrite``.
            It is maintained by ``read_rewrite_def``.
    """

    def __init__(self, rewrite_def_path="./rewrite.def"):
        """Create a normalizer and load its rules from ``rewrite_def_path``."""
        self.ignore_normalize_set = set()
        self.replace_char_map = {}
        self.max_replace_key_length = 0
        self.read_rewrite_def(rewrite_def_path)

    def read_rewrite_def(self, rewrite_def_path):
        """Parse a rewrite definition file and add its rules to this instance.

        Args:
            rewrite_def_path: path of a UTF-8 encoded definition file.

        Raises:
            Exception: if a one-column entry is not a single character, if a
                replacement source is defined twice, or if a line has more
                than two columns. Line numbers in the messages are 1-based.
        """
        with open(rewrite_def_path, encoding="utf8") as f:
            # Count from 1 so reported line numbers match what an editor shows.
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                cols = line.split()
                if len(cols) == 1:
                    if len(cols[0]) != 1:
                        raise Exception("'{}' is not a single character at line {}".format(cols[0], line_no))
                    self.ignore_normalize_set.add(cols[0])
                elif len(cols) == 2:
                    if cols[0] in self.replace_char_map:
                        raise Exception("'Replacement for '{}' defined again at line {}".format(cols[0], line_no))
                    self.replace_char_map[cols[0]] = cols[1]
                    self.max_replace_key_length = max(self.max_replace_key_length, len(cols[0]))
                else:
                    raise Exception("Invalid format '{}' at line {}".format(line, line_no))

    def rewrite(self, text):
        """Return ``text`` with replacements and character normalization applied.

        At each position the longest key of ``replace_char_map`` that matches
        is substituted and the scan resumes right after the matched span.
        When nothing matches, the single character is lower-cased and, unless
        its lower-cased form is in ``ignore_normalize_set``, NFKC-normalized.

        Args:
            text: the string to normalize.

        Returns:
            The normalized string.
        """
        chars_after = []
        text_len = len(text)
        i = 0
        while i < text_len:
            # 1. replace char without normalize (longest match first).
            # No key is longer than max_replace_key_length, so wider windows
            # can never match and are not tried.
            window = min(self.max_replace_key_length, text_len - i)
            for l in range(window, 0, -1):
                replace = self.replace_char_map.get(text[i:i + l])
                if replace is not None:
                    chars_after.append(replace)
                    i += l
                    # Stop at the first (longest) match; trying shorter windows
                    # from the advanced index would emit extra replacements.
                    break
            else:
                # 2. normalize
                # 2-1. capital alphabet (not only latin but greek, cyrillic, etc) -> small
                lower = text[i].lower()
                if lower in self.ignore_normalize_set:
                    replace = lower
                else:
                    # 2-2. normalize (except in ignoreNormalize)
                    # e.g. full-width alphabet -> half-width / ligature / etc.
                    replace = unicodedata.normalize("NFKC", lower)
                chars_after.append(replace)
                i += 1

        return "".join(chars_after)

In [16]:
import pandas as pd

def extract_yomigana(s):
    """Return the text of ``s`` that precedes its first ``;``.

    If that leading part is the literal string ``"nan"``, an empty string is
    returned instead.
    """
    reading, _, _ = s.partition(";")
    return "" if reading == "nan" else reading


def convert_dict_format(s, normalizer, left_id=4786, right_id=4786, cost=8000):
    """Build one comma-separated user-dictionary line from a row.

    Args:
        s: a row; item 0 is the surface form and item 4 is the combined label
            string whose part before the first ``;`` is used as the reading.
        normalizer: object whose ``rewrite(text)`` returns the normalized text.
        left_id: value written to the second column (default 4786).
        right_id: value written to the third column (default 4786).
        cost: value written to the fourth column (default 8000).

    Returns:
        An 18-column line: normalized headword, left_id, right_id, cost,
        normalized headword again, the fixed part of speech
        ``名詞,固有名詞,一般,*,*,*``, the reading, the original surface form,
        and five ``*`` placeholders. With the defaults the output is the same
        as before these parameters existed.
    """
    word = s[0]  # surface form as it appears in the source row
    # Commas would break the column layout, so the headword uses "、" instead.
    # NOTE(review): `word` is written unescaped further down the line; confirm
    # that no surface form contains a comma.
    midashi = normalizer.rewrite(word).replace(",", "、")
    yomi = extract_yomigana(s[4])  # reading taken from the combined label string
    return (
        f"{midashi},{left_id},{right_id},{cost},{midashi},"
        f"名詞,固有名詞,一般,*,*,*,{yomi},{word},*,*,*,*,*"
    )

In [17]:
import os

# Locate rewrite.def inside the installed sudachipy package instead of
# hardcoding one interpreter's site-packages directory. `sudachipy` is imported
# in the notebook's first cell.
rewrite_def_path = os.path.join(os.path.dirname(sudachipy.__file__), "resources", "rewrite.def")
normalizer = SudachiCharNormalizer(rewrite_def_path=rewrite_def_path)

In [10]:
# Load the Manbyo dictionary spreadsheet; the path is relative to the working directory.
df = pd.read_excel("../data/MANBYO_20190704.xlsx")

In [11]:
# Display the loaded frame (the notebook shows a truncated view of it).
df

Unnamed: 0,出現形,ICDコード,標準病名,信頼度LEVEL,しゅつげんけい;icd=ICDコード;lv=信頼度LEVEL/freq=頻度LEVEL;標準病名
0,疼痛,R529,疼痛,S,とうつう;icd=R529;lv=S/freq=高頻度;疼痛
1,発熱,R509,発熱,S,はつねつ;icd=R509;lv=S/freq=高頻度;発熱
2,嘔気,R11,嘔気,S,おうき;icd=R11;lv=S/freq=高頻度;嘔気
3,出血,R58,出血,S,しゅっけつ;icd=R58;lv=S/freq=高頻度;出血
4,糖尿病,E14,糖尿病,S,とうにょうびょう;icd=E14;lv=S/freq=高頻度;糖尿病
...,...,...,...,...,...
362861,関節可動閾障害,,,F,nan;icd=nan;lv=F/freq=低頻度;nan
362862,喉頭知覚亢進,,,F,nan;icd=nan;lv=F/freq=低頻度;nan
362863,側面濁音なし,,,F,nan;icd=nan;lv=F/freq=低頻度;nan
362864,脾実質石灰化,,,F,nan;icd=nan;lv=F/freq=低頻度;nan


In [12]:
# Collect the surface-form and ICD-code columns as plain Python lists,
# keyed by the column names used for the exported table.
icd_dict = {
    "出現形": df["出現形"].tolist(),
    "ICD": df["ICDコード"].tolist(),
}

In [13]:
# Turn the two collected lists into a frame with columns "出現形" and "ICD".
df_icd = pd.DataFrame(icd_dict)

In [14]:
# Write the surface-form -> ICD table to icd.csv in the working directory
# (pandas' default settings apply, so the row index is written as the first column).
df_icd.to_csv("icd.csv")

In [18]:
# Export 1: only rows whose reliability level is S, A, B or C.
target_df = df[df["信頼度LEVEL"].isin(["S", "A", "B", "C"])]
output_df = target_df.apply(lambda x: convert_dict_format(x, normalizer), axis=1)
# header/index are passed as explicit booleans: leaving `header` as None makes
# older pandas emit a FutureWarning from Series.to_csv, and False produces the
# same file. Each element is already a complete comma-separated line, so "|"
# is used as a separator that is not expected to occur in the data.
# NOTE(review): with default quoting, pandas would wrap a line containing "|"
# or '"' in quotes -- confirm no entry contains those characters.
output_df.to_csv("manbyo20190704_sabc_dic.txt", header=False, index=False, sep="|")

# Export 2: every row of the spreadsheet.
output_df = df.apply(lambda x: convert_dict_format(x, normalizer), axis=1)
output_df.to_csv("manbyo20190704_all_dic.txt", header=False, index=False, sep="|")

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [18]:
# Build the user dictionary user_mall.dic (-o) from the all-rows source file,
# passing the core system dictionary with -s.
!sudachipy ubuild \
  -s /opt/conda/lib/python3.7/site-packages/sudachidict_core/resources/system.dic \
  manbyo20190704_all_dic.txt -o user_mall.dic

reading the source file...362866 words
writing the POS table...2 bytes
writing the connection matrix...4 bytes
building the trie...done
writing the trie...13386756 bytes
writing the word-ID table...1813299 bytes
writing the word parameters...2177200 bytes
writing the word_infos...12334775 bytes
writing word_info offsets...1451464 bytes


In [19]:
# Build the user dictionary user_msabc.dic (-o) from the S/A/B/C-only source
# file, passing the core system dictionary with -s.
!sudachipy ubuild \
  -s /opt/conda/lib/python3.7/site-packages/sudachidict_core/resources/system.dic \
  manbyo20190704_sabc_dic.txt -o user_msabc.dic

reading the source file...73342 words
writing the POS table...2 bytes
writing the connection matrix...4 bytes
building the trie...done
writing the trie...2954244 bytes
writing the word-ID table...366565 bytes
writing the word parameters...440056 bytes
writing the word_infos...3228270 bytes
writing word_info offsets...293368 bytes


In [21]:
import spacy

# Load the 'ja_ginza' model and analyse a sentence containing a disease name.
nlp = spacy.load('ja_ginza')
doc = nlp('線維筋痛症になった')

# Print each token's surface text, one per line, walking the sentences in order.
for token in (tok for sentence in doc.sents for tok in sentence):
    print(token.orth_)

線維筋痛症
に
なっ
た
