# 利用Python 比對兩個敦煌XML專案的缺字使用情況

## 1. 需求函式庫

In [2]:
from lxml import etree
import re
import os
import requests
import operator


## 2. 設定參數

In [3]:
# Input: folder that holds the XML files, and the file names to compare.
XMLDIR = "data"
XMLFILEs = [
  "LIN-pomo-S-3491.xml",
  "LIN-pomo-P-2187.xml",
  # "LIN3-Huanxiguowangyuan-P-3375v.xml"
]

# Output: folder for the generated reports and the two report file names.
OUTDIR = "out"
gcOUTFILE = "g-out.html"   # output file name
scOUTFILE = "sc-out.html"  # output file name for the sic/corr report

# Full output locations, assembled as ./<OUTDIR>/<file name>.
gcOUTPATH, scOUTPATH = (
    os.path.join(".", OUTDIR, fname) for fname in (gcOUTFILE, scOUTFILE))

# Image folder (used when building <img> links in the report).
IMGDIR = "imgs"


## 3. Orig比對相關副函式
### 3.1 Orig 擷取函式

In [4]:
def getORIG(xmlpath):
  """Collect the TEI <orig> elements of one XML file, grouped by @reg.

  Two shapes of <orig> are expected:
    C1. <orig reg="年"><g ref="#A01194-037"/></orig>  -> the @ref of <g>
    C2. <orig reg="來" type="CJK">来</orig>           -> the text content

  Args:
    xmlpath: path of the XML file to parse.

  Returns:
    dict shaped as {reg: {mapped: {"type": "g" | "text", "count": n}, ...}},
    where `mapped` is the <g> @ref (C1) or the text of the <orig> (C2).
  """
  tei_ns = "http://www.tei-c.org/ns/1.0"
  g_tag = "{" + tei_ns + "}g"

  origData = {}

  # Parse the XML tree and fetch every <orig> node.
  xmltree = etree.parse(xmlpath)
  origNodes = xmltree.xpath('//tei:orig', namespaces={'tei': tei_ns})

  for node in origNodes:
    key = node.get("reg")

    if len(node) > 0 and node[0].tag == g_tag:
      # C1: the first child is <g>; its @ref identifies the glyph.
      mapped = node[0].get("ref")
      kind = "g"
    elif len(node) > 0:
      # Some other child element: fall back to the flattened text content.
      mapped = etree.tostring(node, method="text",
                              encoding="utf-8", with_tail=False).decode("utf-8")
      kind = "text"
    else:
      # C2: plain text directly inside <orig>.
      mapped = node.text
      kind = "text"

    # Record reg -> mapped, counting repeated occurrences.
    appendToCountDict(origData, key, mapped, kind)

  return origData


### 3.2 orig 內容列印

In [5]:
def orig_to_htmlstr(orig):
  """Render the variants recorded for one reg as an HTML ordered list.

  Args:
    orig: dict shaped as {mapped: {"type": type, "count": n}, ...}.

  Returns:
    "<ol>...</ol>" with one <li> per entry, written as
    mapped(type)<img>[count]; the <img> is present only for type "g".
  """
  items = []
  for ref, info in orig.items():
    if info["type"] == "g":
      # ref[1:] drops the first character of the ref (e.g. the "#" of
      # "#A01194-037") to get the image file stem.
      # NOTE(review): os.path.join builds the src, so the separator follows
      # the OS the script runs on.
      picture = "<img src='{}'>".format(
          os.path.join("..", IMGDIR, ref[1:] + ".png"))
    else:
      picture = ""
    items.append("<li>{}({}){}[{}]</li>".format(
        ref, info["type"], picture, info["count"]))

  return "<ol>{}</ol>".format("".join(items))


### 3.3 比對一組iwordRec內的map相似度

In [6]:
def record_similarity(iwordRecs):
  """Score how similar the map sets recorded for one iword are.

  Args:
    iwordRecs: list with one entry per compared source (in cmpOrder order).
      Each entry is the dict {map: {"type": type, "count": n}, ...} recorded
      for the same iword in that source; only the keys are used here.

  Returns:
    K * NS / (N1 + N2 + ... + NK), where K is the number of entries, Ni the
    number of maps in entry i, and NS the number of maps present in every
    entry. 1 means all entries hold the same maps, 0 means no map is shared
    by all of them. When there is nothing to compare (empty list, or every
    entry is empty) the formula is 0/0; 0.0 is returned instead of raising.
  """
  key_sets = [set(rec) for rec in iwordRecs]

  # N1 + N2 + ... + NK
  total = sum(len(keys) for keys in key_sets)
  if total == 0:
    return 0.0

  # Maps shared by every entry (NS is the size of this set).
  shared = set.intersection(*key_sets)

  return len(shared) * len(key_sets) / total


### 3.4 orig 比對與結果匯出

In [7]:
def origComp(origData, cmpOrder=[]):
  """Build the "缺字比較表" comparison HTML for orig data.

  Delegates to countArrayCmp, rendering each record with orig_to_htmlstr.
  """
  return countArrayCmp(origData, "缺字比較表",
                       cmpOrder=cmpOrder, recordDispfunc=orig_to_htmlstr)

### 3.5 AppendToCountDict 公用程式

In [8]:
def appendToCountDict(cData, key, map, type="default"):
  """Count one occurrence of `map` under `key` in `cData` (in place).

  Structure maintained: {key: {map: {"type": type, "count": n}, ...}}.
  The first occurrence stores `type` with a count of 1; later occurrences
  only increase the count and leave the stored type untouched.
  """
  bucket = cData.setdefault(key, {})
  if map in bucket:
    bucket[map]["count"] += 1
  else:
    bucket[map] = {"type": type, "count": 1}

### 3.6 recordDispfunc 預設函式

In [9]:
def record_to_htmlstr(rec):
  """Default record renderer: an HTML ordered list of map[count] items.

  Args:
    rec: dict shaped as {map: {"count": n}, ...}.
  """
  items = ["<li>{}[{}]</li>".format(name, info["count"])
           for name, info in rec.items()]
  return "<ol>{}</ol>".format("".join(items))

### 3.7 countArray 比對與結果匯出

In [10]:
def countArrayCmp(cArray, caption, cmpOrder=[], recordDispfunc=record_to_htmlstr):
  """Compare count data from several sources and render the result as HTML.

  Only the iwords present in every compared source are listed. For each of
  them the per-source records are shown side by side together with their
  similarity (see record_similarity); rows are ordered from most to least
  similar.

  Args:
    cArray: {source: {iword: {map: {"type": type, "count": n}, ...}}}.
    caption: main title shown at the top of the report.
    cmpOrder: keys of cArray to compare, in column order. Empty means all
      keys of cArray.
    recordDispfunc: function turning one record
      ({map: {"type": ..., "count": ...}}) into an HTML string.

  Returns:
    HTML fragment holding a summary table and a detail table.
  """
  page_template = """
      <h1>{}</h1><h2>比對結果彙總</h2>
      <table>{}</table>
      <h2>細節內容</h2>
      <table>
        <thead>{}</thead><tbody>{}</tbody>
      </table>
  """

  # Default comparison order: every source. The name is rebound, so the
  # shared default list is never mutated.
  if len(cmpOrder) == 0:
    cmpOrder = list(cArray.keys())

  # Header row of the detail table; it is also echoed to stdout.
  rTableHeadStr = "<tr><th>No.</th><th>Reg</th>{}<th>相似程度</th></tr>".format(
      "".join(["<th>" + c + "</th>" for c in cmpOrder]))
  print(rTableHeadStr)

  # iwords present in every compared source. With nothing to compare the
  # intersection is empty rather than undefined.
  key_sets = [set(cArray[cmpo].keys()) for cmpo in cmpOrder]
  shared_iwords = set.intersection(*key_sets) if key_sets else set()

  # fullMatch, partialMatch, unMatch
  counts = {"fM": 0, "pM": 0, "uM": 0}

  # (cells after the No. column, similarity) per iword; the No. column is
  # added after sorting.
  rows = []

  # Iterating in a fixed order keeps rows with equal similarity in the same
  # order on every run (set iteration order is not stable across runs).
  # key=str also tolerates a None iword.
  for iword in sorted(shared_iwords, key=str):
    cells = "<td>{}</td>".format(iword)

    # One column per source; a record usually has several maps, so each
    # column is rendered as a list by recordDispfunc.
    for cmpo in cmpOrder:
      cells += "<td>{}</td>".format(recordDispfunc(cArray[cmpo][iword]))

    osim = record_similarity([cArray[cmpo][iword] for cmpo in cmpOrder])

    if osim == 1:
      counts["fM"] += 1
      cells += "<td class='tBlue'>完全相同(100%)</td>"
    elif osim == 0:
      counts["uM"] += 1
      cells += "<td class='tRed'>完全不同(0%)</td>"
    else:
      counts["pM"] += 1
      cells += "<td>部分相同</td>"

    rows.append((cells, osim))

  # Most similar first; the sort is stable, so ties keep the order above.
  rows.sort(key=operator.itemgetter(1), reverse=True)

  # The row number is concatenated around the finished cells instead of being
  # injected with str.format, so "{" or "}" inside the data cannot break it.
  rTableBodyStr = "".join(
      "<tr><td>{}</td>{}</tr>".format(no, cells)
      for no, (cells, _) in enumerate(rows, start=1))

  # Summary table: count and percentage per category. With no shared iword
  # the percentages are reported as 0 instead of dividing by zero.
  total = len(shared_iwords)
  percent = {name: (n / total * 100 if total else 0.0)
             for name, n in counts.items()}

  rIndexTableStr = """
          <tr><th>分類</th><th>總數</th><th>比例</th></tr>
          <tr><td class='tBlue'>完全相同(100%)</td><td class='bold'>{1}</td><td>{4:.2f}%</td></tr>
          <tr><td>部分相同</td><td class='bold'>{2}</td><td>{5:.2f}%</td></tr>
          <tr><td class='tRed'>完全不同(0%)</td><td class='bold'>{3}</td><td>{6:.2f}%</td></tr>
          <tr><td>小計</td><td>{0}</td><td>-</td></tr>""".format(
      total, counts["fM"], counts["pM"], counts["uM"],
      percent["fM"], percent["pM"], percent["uM"])

  return page_template.format(caption, rIndexTableStr, rTableHeadStr, rTableBodyStr)


## 4. choice/sic/corr 比對
### 4.1 Get choice[sic]

In [40]:
def getChoice(xmlpath):
  """Collect the sic/corr pairs of every TEI <choice> in one XML file.

  Only <choice> elements holding both <sic> and <corr> are processed; a
  <choice> with <sic> but no <corr> is skipped. The representative text of
  each side comes from getCorrSicText (the text itself, or orig/@reg when
  the side wraps an <orig>).

  Args:
    xmlpath: path of the XML file to parse.

  Returns:
    dict shaped as {text: {counterpart: {"type": "text", "count": n}, ...}}.
    Every pair is stored twice (sic -> corr and corr -> sic) so it can be
    looked up from either side.
  """
  ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
  choiceData = {}

  # Parse the XML tree and fetch every <choice> that contains a <sic>.
  xmltree = etree.parse(xmlpath)
  choiceNodes = xmltree.xpath('//tei:choice[tei:sic]', namespaces=ns)

  for cnode in choiceNodes:
    sic_node = cnode.find("tei:sic", namespaces=ns)
    corr_node = cnode.find("tei:corr", namespaces=ns)

    # The XPath above only guarantees <sic>; without <corr> there is no pair
    # to record, and getCorrSicText cannot handle a missing node.
    if corr_node is None:
      continue

    sic_text = getCorrSicText(sic_node)
    corr_text = getCorrSicText(corr_node)

    # Two-way lookup is needed, so the same pair is recorded in both
    # directions.
    appendToCountDict(choiceData, sic_text, corr_text, type="text")
    appendToCountDict(choiceData, corr_text, sic_text, type="text")

  return choiceData
  


In [12]:
def outputSubElements(node, lv=0):
  """Return the local tag names of `node` and its descendants as one string.

  Each name is followed by "-"; `lv` spaces are put in front of the name of
  `node` itself.
  """
  # NOTE(review): the recursive call does not forward `lv`, so descendants
  # are never indented - confirm whether lv + 1 was intended.
  pieces = [" " * lv + etree.QName(node).localname + "-"]
  pieces.extend(outputSubElements(child) for child in node)
  return "".join(pieces)


### 4.2 Choice Compare

In [13]:
def choiceComp(origData, cmpOrder=[]):
  """Build the "通假字比較表" comparison HTML for sic/corr data.

  Delegates to countArrayCmp with its default record renderer.
  """
  return countArrayCmp(origData, "通假字比較表", cmpOrder=cmpOrder)

### 4.3 Get sic/corr 代表字
邏輯相同，但有點囉唆，因此獨立為函式

In [34]:
def getCorrSicText(node):
  """Return the representative text of a <sic> or <corr> node.

  If the first child of `node` is a TEI <orig>, its @reg is used; otherwise
  the flattened text content of `node` is used.

  Raises:
    SystemExit: with code 1 (after printing a message) when the <orig> has
      no @reg.
  """
  # An <orig> child is present: use orig/@reg.
  if len(node) > 0 and node[0].tag == "{http://www.tei-c.org/ns/1.0}orig":
    text = node[0].get("reg")
    if text is None:
      print("有orig, 但orig不存在@reg()")
      # Raised directly: the exit() helper is only injected by the site
      # module and is meant for interactive use.
      raise SystemExit(1)
    return text

  # Otherwise take the node content as text.
  return etree.tostring(node, method="text",
                        encoding="utf-8", with_tail=False).decode("utf-8")


## 5. 主程式

In [42]:
# Page skeleton: first slot is the <title>, second slot is the <body> content.
MainHTMLStr="""
 <!DOCTYPE html >
  <html>
   <head>
      <title> {} </title>
      <meta charset = "UTF-8"/> <link rel="stylesheet" href="mystyle.css" />
    </head>
    <body>
      {}
    </body>
  </html >
"""

#----------------  Orig (missing character) comparison  ----------------
# Collect the orig data of every XML file, keyed by file name.
ALLOrigData = {}

for xmlfile in XMLFILEs:
  print ("======== {} ========".format(xmlfile))
  origs=getORIG(os.path.join(XMLDIR,xmlfile))
  ALLOrigData[xmlfile]=origs

orig_html_str = origComp(ALLOrigData)

# Final output. open() does not create missing folders, so make sure the
# output folder exists first.
os.makedirs(os.path.dirname(gcOUTPATH), exist_ok=True)
with open(gcOUTPATH, "w", encoding="UTF-8") as ofile:
  ofile.write(MainHTMLStr.format("缺字比較", orig_html_str))
  print("缺字比較結果輸出結果至"+gcOUTPATH)

#----------------  sic/corr comparison  ----------------
# Collect the choice (sic/corr) data of every XML file, keyed by file name.
ALLChoiceData = {}
for xmlfile in XMLFILEs:
  print("======== {} ========".format(xmlfile))
  choices = getChoice(os.path.join(XMLDIR, xmlfile))
  ALLChoiceData[xmlfile] = choices

choice_html_str = choiceComp(ALLChoiceData)

# Final output.
os.makedirs(os.path.dirname(scOUTPATH), exist_ok=True)
with open(scOUTPATH, "w", encoding="UTF-8") as ofile:
  ofile.write(MainHTMLStr.format("通假字比較", choice_html_str))
  # Report the file that was actually written (the sic/corr report).
  print("通假字比較結果輸出結果至"+scOUTPATH)




<tr><th>No.</th><th>Reg</th><th>LIN-pomo-S-3491.xml</th><th>LIN-pomo-P-2187.xml</th><th>相似程度</th></tr>
<tr><th>No.</th><th>Reg</th><th>LIN-pomo-S-3491.xml</th><th>LIN-pomo-P-2187.xml</th><th>相似程度</th></tr>
輸出結果至./out/g-out.html
