<a href="https://colab.research.google.com/github/ishizue-da/TestRepository/blob/master/CreateMecabDictionary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 同義語変換辞書

In [None]:
!pip install importnb
!pip install ipdb

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import pickle
from collections import defaultdict
from functools import reduce

import pandas as pd
import ipdb

# Base directory for relative paths. The quoted literal "__file__" is an
# ordinary relative path name, so it resolves against the current working
# directory (presumably because notebooks do not define __file__ - confirm).
selfDir = os.path.split(os.path.abspath("__file__"))[0]
# Make the sibling NLPT sources importable.
sys.path.append(os.path.join(selfDir, '../NLPT/src'))

# from JapaneseTextPreProcessor import JapaneseTextPreprocess

import importnb
with __import__('importnb').Notebook(): 
    from JapaneseTextPreProcessor import JapaneseTextPreprocess


def _splitTermCell(pre, cell):
    """Normalise one glossary cell and split it into candidate terms.

    pre  : preprocessor instance providing normalizeHanzen()
    cell : raw text of one CSV cell

    Returns the list of fragments, in their original order, that are
    longer than one character.
    """
    # Width normalisation by the project preprocessor (per the original
    # comment: kana to full-width, full-width alphanumerics and symbols to
    # half-width), followed by lowercasing.
    text = pre.normalizeHanzen(cell).lower()
    # Drop bracketed annotations such as "(...)" or "[...]".
    # NOTE(review): '|' inside the character classes is a literal, so a
    # span delimited by '|' is removed as well - confirm this is intended.
    text = re.sub(r'[(|\[].*?[)|\]]', '', text)
    # Remove non-breaking spaces, then split on ',', '・' and newline.
    fragments = re.split('[,・\n]', text.replace('\xa0', ''))
    return [t for t in fragments if len(t) > 1]


#========================================================
# Read a CSV glossary and build the term / synonym dictionary
#
# TermListFile : glossary in CSV format
# SynonymDict  : term / synonym dictionary to extend (optional)
# TermColName  : name of the column holding the colloquial synonyms
#========================================================
def makeDictionary(TermListFile, SynonymDict = None, TermColName = '慣用語'):
    """Build a mapping from each main term to the set of its synonyms.

    For every row, the first fragment of the '用語' cell becomes the main
    term. The remaining fragments of that cell, plus the fragments of the
    TermColName cell, become its synonyms. Rows whose '用語' cell is empty,
    or yields no usable fragment, are skipped.

    TermListFile : path (or any source accepted by pandas.read_csv) of the
                   glossary; it must contain the columns '用語' and
                   TermColName.
    SynonymDict  : existing dict of term -> set of synonyms. It is updated
                   in place and returned. When omitted, a fresh dict is
                   created for each call.
    TermColName  : column holding the colloquial synonyms.

    Returns the (possibly newly created) SynonymDict.
    Raises KeyError if a required column is missing.
    """
    # A mutable default would be shared between calls and leak entries from
    # one glossary into the next, so a new dict is created here instead.
    if SynonymDict is None:
        SynonymDict = {}

    # Instantiate the preprocessing library.
    pre = JapaneseTextPreprocess()

    # Read every cell as text so numeric-looking cells do not break the
    # string handling below; missing cells become empty strings.
    TermListDF = pd.read_csv(TermListFile, dtype=str).fillna("")

    # Pair the two columns row by row instead of looking rows up by label.
    for term, synonym in zip(TermListDF['用語'], TermListDF[TermColName]):
        if not term:
            continue

        termList = _splitTermCell(pre, term)
        if not termList:
            continue
        MainTerm = termList[0]

        # Synonyms come from the rest of the term cell and the synonym cell.
        synonyms = set(termList[1:]) | set(_splitTermCell(pre, synonym))
        # A term must not be listed as a synonym of itself.
        synonyms.discard(MainTerm)

        # Creates the entry (possibly with an empty set) or extends it.
        SynonymDict.setdefault(MainTerm, set()).update(synonyms)

    return SynonymDict
    
#=============================================================
# Merge entries of the synonym dictionary whose key also appears
# as a synonym (value) of another entry
# SynonymDict  : term / synonym dictionary (term -> set of synonyms)
#=============================================================
def checkDictConflict(SynonymDict):
    """Fold terms that are both a key and somebody else's synonym.

    When a key K is listed as a synonym of other keys, K's synonyms are
    added to each of those keys and the entry for K is removed. Passes are
    repeated until nothing more can be merged, so chains such as
    a -> b -> c collapse completely.

    SynonymDict : dict of term -> set of synonyms; modified in place.

    Returns the same SynonymDict object.
    """
    while True:
        # set().union() also handles an empty dictionary, where reduce()
        # without an initial value would raise TypeError.
        allSynonyms = set().union(*SynonymDict.values())
        # Keys are walked in dictionary order so the outcome does not
        # depend on set iteration order.
        conflict = [term for term in SynonymDict if term in allSynonyms]

        # For each conflicting term, collect the keys that list it.
        mergeTerm = defaultdict(set)
        for term in conflict:
            for k, v in SynonymDict.items():
                # k = term, v = its synonyms. A term listing itself is
                # skipped: merging an entry into itself and then deleting
                # it would lose the entry.
                if k != term and term in v:
                    mergeTerm[term].add(k)

        mergedAny = False
        for k, v in mergeTerm.items():
            if k not in SynonymDict:
                continue
            # Only targets that still exist can receive the synonyms; a
            # target may have been merged away earlier in this pass.
            targets = [repTerm for repTerm in v if repTerm in SynonymDict]
            for repTerm in targets:
                # Do not make the target a synonym of itself.
                SynonymDict[repTerm] |= SynonymDict[k] - {repTerm}
            if targets:
                del SynonymDict[k]
                mergedAny = True

        # Each productive pass removes at least one key, so this terminates.
        if not mergedAny:
            return SynonymDict
    
#=========================================================================
# Convert the synonym dictionary into MeCab user-dictionary rows
# input :
#    SynonymDict : synonym dictionary
# output :
#    list of rows laid out as
#    <term to register>,<ID>,<ID>,<weight>,<part of speech>,
#    <part-of-speech description>,<*>,<*>,<conjugation>,<conjugated form>,
#    <base form of the word to register>,<katakana reading>,<katakana notation>
#=========================================================================
def transformMecabFormat(SynonymDict):
    """Return one 13-field row per term and per synonym.

    Each key yields a row whose base-form field is the key itself; each of
    its synonyms yields a row whose base-form field is that key. Rows for a
    key come first, followed by the rows of its synonyms.
    """
    def buildRow(surface, baseForm):
        # The weight becomes more negative as the surface form gets longer.
        cost = -int(len(surface) ** 1.3)
        return [surface, 1285, 1285, cost, '名詞', '用語集',
                '*', '*', '*', '*', baseForm, '*', '*']

    rows = []
    for headword in SynonymDict:
        rows.append(buildRow(headword, headword))
        rows.extend(buildRow(variant, headword) for variant in SynonymDict[headword])
    return rows
    

def cleansingPartsCategory(TermListDF, TermsCol='用語'):
    """Cleanse the term column of an equipment-category table in place.

    Every cell of column TermsCol is passed through the preprocessor's
    normalizeHanzen(), lowercased, stripped of bracketed annotations and of
    non-breaking spaces, then written back.

    TermListDF : pandas DataFrame holding the table; it is modified.
    TermsCol   : name of the column to cleanse.

    Returns the same TermListDF object.
    """
    # Instantiate the preprocessing library.
    normalizer = JapaneseTextPreprocess()
    # Matches a bracketed annotation such as "(...)" or "[...]".
    bracketed = re.compile(r'[(|\[].*?[)|\]]')

    for rowLabel, record in TermListDF.iterrows():
        lowered = normalizer.normalizeHanzen(record[TermsCol]).lower()
        # Remove the bracketed part first, then the \xa0 characters.
        withoutBrackets = bracketed.sub('', lowered)
        TermListDF.at[rowLabel, TermsCol] = withoutBrackets.replace('\xa0', '')

    return TermListDF
    



"""
if __name__ == "__main__":
    print('start')
    TermListFile = os.path.join(selfDir, '../data/火力発電用語ボイラ及び附属装置.csv')
    
    #csv形式の用語集を読み込み、慣用語→用語変換辞書を作成
    SynonymDict = makeDictionary(TermListFile)
    SynonymDict = checkDictConflict(SynonymDict)
    
    #pickle化して保存
    with open(TermListFile[:-4]+'.pickle', mode='wb') as f:
        pickle.dump(SynonymDict, f)
    
    with open(os.path.splitext(TermListFile)[0]+'.pickle', mode='rb') as f:
        SynonymDict = pickle.load(f)
    
    #辞書をMecab形式csvに変換
    transformMecabFormat(SynonymDict, os.path.splitext(TermListFile)[0])
    
    print('Dictionary Creation Process was done.')
    print('Next: execute under command')
    print('/usr/local/libexec/mecab/mecab-dict-index -d /usr/local/lib/mecab/dic/ipadic -u /path/to/userdic.dic -f utf-8 -t utf-8 /path/to/userdic.csv')
"""    
    
    
